diff --git a/.hydra/config.yaml b/.hydra/config.yaml index 26c9a959d752177a989ad79799c0acf6b7192136..5998500bb8ba4de3e370d4eebe7e33bfafe0cf93 100644 --- a/.hydra/config.yaml +++ b/.hydra/config.yaml @@ -81,7 +81,7 @@ train: max_tokens: 2097152000 grad_acc_tokens: 32768 max_grad_norm: 1.0 - gradient_checkpointing: true + gradient_checkpointing: false bias_weight_decay: false normalization_weight_decay: false conv_weight_decay: true diff --git a/checkpoints/step-000000209715200.pt b/checkpoints/step-000000209715200.pt index 8711e08ddce3599a8603cd6328850559ebb9ca01..9508c4669ce97323864082ed7ae28f7599412dff 100644 --- a/checkpoints/step-000000209715200.pt +++ b/checkpoints/step-000000209715200.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3596ad684b7b5c3533050c96dda223ffb053d49e1f227e367f93a911c9451a95 -size 329410370 +oid sha256:a9e45d1cf8fcb47d3de97c6d87e5f89f3999fa51cf1153d98e06ddd01738884a +size 329409794 diff --git a/checkpoints/step-000000419430400.pt b/checkpoints/step-000000419430400.pt index 0136b9d97fb405c75a5068823e4f7025cbbf0f87..0877422c494f20fea2bece421daac636b9ec0baa 100644 --- a/checkpoints/step-000000419430400.pt +++ b/checkpoints/step-000000419430400.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd7db428d4fe8e0788e2e1127e026b822b6f87bec105e98d76bc1f4e112ef145 -size 329410370 +oid sha256:76200303f1e014e549314ee020fa0e5d22b5df5ab722b78939721493230cd0e9 +size 329409794 diff --git a/checkpoints/step-000000629145600.pt b/checkpoints/step-000000629145600.pt index d61c1791b6501f443cb2f0c1ededf96f7294fa9a..94c7e9443954d89d6652d0c1660b5f7128154011 100644 --- a/checkpoints/step-000000629145600.pt +++ b/checkpoints/step-000000629145600.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1048a72d60bdefe0fd931d6c2028486a3ec81f5512b21e2990f1f082539eaf01 -size 329410370 +oid sha256:0edce8777efa82db6db58947a86af1e70965bcd8111d157ab482d93509e950ae +size 329409794 diff --git a/checkpoints/step-000000838860800.pt b/checkpoints/step-000000838860800.pt index 347c902d5bd972c204ef6801197da953899b1c34..5c57d9eff4ccd009a335a5cec8222d8989c26310 100644 --- a/checkpoints/step-000000838860800.pt +++ b/checkpoints/step-000000838860800.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cac0b75d4411b2c94b60cda43b32d38f62140e8533b9d760cdb4b11b7c450573 -size 329410370 +oid sha256:5669d373118b51004bf8aea4fe7a13ee19dbdc68f9e312defe7b150448fe71b1 +size 329409794 diff --git a/checkpoints/step-000001048576000.pt b/checkpoints/step-000001048576000.pt index 3fbae7e7ec0a2d68e012d8ed4607a7238ce252d1..b36cbdfb496d0be57d1fe9601f4ae4422ed52ec7 100644 --- a/checkpoints/step-000001048576000.pt +++ b/checkpoints/step-000001048576000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:759bd71c4f858504bff61e87b38aa62710d888dc6842a39d5999c2740fae9d57 -size 329410370 +oid sha256:ee4dbd5ba0786230f8dc3da5b8d9004fc0a397ba0c66cb84281ac680baecca2d +size 329409794 diff --git a/checkpoints/step-000001258291200.pt b/checkpoints/step-000001258291200.pt index f7391816b3633d42095ab718bdf306c41ef70e4e..601541c0bc05814ec52ff2499e54218bebbf1fcf 100644 --- a/checkpoints/step-000001258291200.pt +++ b/checkpoints/step-000001258291200.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:305b86ef3a38ba06a63f4734f0bebc4e55cab923fbf9b2adef54d8b0e164f548 -size 329410370 +oid sha256:f97eccdcb9b8e784360b41031ee52a305bbfa2f5f86aded5ce80cd7ba2f8fa26 +size 329409794 diff --git a/checkpoints/step-000001468006400.pt b/checkpoints/step-000001468006400.pt index c9a9180db2134d530e4ed1b11f2a25938011ea6e..989095beefa19bdcdac2d61e611ee7ec5a9387ca 100644 --- a/checkpoints/step-000001468006400.pt +++ b/checkpoints/step-000001468006400.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88d668ea1e8c49c30d9f8988770c78382d9fff490ee8ef7f1bd9c8cee51efe74 -size 329410370 +oid sha256:a9966465586a4d501839d48e7386d69869fbc6a1c9d5fb1b6f332d41f5b76b2b +size 329409794 diff --git a/checkpoints/step-000001677721600.pt b/checkpoints/step-000001677721600.pt index 413c60070f81a8de4f3a297bd3c9413005ac6630..283246eb3798b554b292476424fcb99a675d5e2a 100644 --- a/checkpoints/step-000001677721600.pt +++ b/checkpoints/step-000001677721600.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85718ef1df320a10c05a05c1c8e367a7bdfd147dba12228dc68dfdd51d63c461 -size 329410370 +oid sha256:812207ef44485fecc552a69460100dd3edeb92b60f525c24ec4731075d854566 +size 329409794 diff --git a/checkpoints/step-000001887436800.pt b/checkpoints/step-000001887436800.pt index 9d6a4dfc67112291f7faf81a344eddd0593466e2..6ffe1ea4b602ce91f52c225620355e0feefa8cfc 100644 --- a/checkpoints/step-000001887436800.pt +++ b/checkpoints/step-000001887436800.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8e34b44d8066dd7312fdada84cbb524946011c4ee286a987d3596b7b983fe0d -size 329410370 +oid sha256:12204d8faa3d1b317437b6a40475416449528599d5a3e0eabd4f077a5f4c8544 +size 329409794 diff --git a/config.yaml b/config.yaml index a88da4ce4350ab1979121635ff42e846c08788d3..e901a048786db56e50f61bc2518284181007d412 100644 --- a/config.yaml +++ b/config.yaml @@ -81,7 +81,7 @@ train: max_tokens: 2097152000 grad_acc_tokens: 32768 max_grad_norm: 1.0 - gradient_checkpointing: true + gradient_checkpointing: false bias_weight_decay: false normalization_weight_decay: false conv_weight_decay: true diff --git a/decay_params.txt b/decay_params.txt index 594174bb9c7c453d9bfca41187ccdaf55c0f9b80..da0fadf570ed74e3166b3325c3da23358bfba211 100644 --- a/decay_params.txt +++ b/decay_params.txt @@ -1,14 +1,14 @@ -_forward_module._fsdp_wrapped_module.model.embeddings.weight -_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight +_forward_module._fsdp_wrapped_module.emb.weight +_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight _forward_module._fsdp_wrapped_module.lm_head.weight diff --git a/logs/2025-10-26_21-16-14.log b/logs/2025-10-26_21-16-14.log new file mode 100644 index 0000000000000000000000000000000000000000..480e42e61331fed8a74eca39c8f56218b97b3aa6 --- /dev/null +++ b/logs/2025-10-26_21-16-14.log @@ -0,0 +1,258 @@ +[2025-10-26 21:16:14][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_2_4_256` +[2025-10-26 21:16:14][train:375][INFO] Configuration: +[2025-10-26 21:16:14][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_2_4_256/config.yaml. +[2025-10-26 21:16:14][train:387][INFO] creating datamodule +[2025-10-26 21:16:14][train:419][INFO] creating model +[2025-10-26 21:16:15][train:440][INFO] creating optimizer +[2025-10-26 21:16:15][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints... +[2025-10-26 21:16:15][logger:256][INFO] Setting up wandb logger... +[2025-10-26 21:16:15][logger:272][INFO] Not resuming. Creating a new wandb run. +[2025-10-26 21:16:16][logger:288][INFO] wandb initialized. Run id: pun8f82u +[2025-10-26 21:16:16][logger:186][INFO] Setting up jsonlines logger... +[2025-10-26 21:16:16][logger:113][INFO] Setting up npz logger... +[2025-10-26 21:16:16][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024] +[2025-10-26 21:16:16][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1] +[2025-10-26 21:16:16][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128] +[2025-10-26 21:17:13][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:57] [ETA: 1:34:15] [loss: 10.077] [tokens/s: 392645.003] [batches/s: 0.187] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:18:06][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:50] [ETA: 1:30:15] [loss: 8.170] [tokens/s: 392713.958] [batches/s: 0.187] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:18:06][train:194][INFO] Running validation... +[2025-10-26 21:19:46][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 110.518] [val/train_update_time: 110.195] [val/loss: 8.073] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.972] [val/val_tokens_per_second: 409716.276] [val/loss_avg_len_2048: 8.073] [val/perplexity_len_2048: 3205.650] [val/loss_avg_len_1024: 8.071] [val/perplexity_len_1024: 3201.383] [val/loss_avg_len_512: 8.072] [val/perplexity_len_512: 3203.464] +[2025-10-26 21:20:40][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:04:23] [ETA: 2:22:11] [loss: 7.760] [tokens/s: 238672.825] [batches/s: 0.114] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:21:33][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:05:17] [ETA: 2:06:53] [loss: 7.535] [tokens/s: 265381.208] [batches/s: 0.127] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:21:33][train:194][INFO] Running validation... +[2025-10-26 21:23:13][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 317.249] [val/train_update_time: 216.731] [val/loss: 7.520] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.601] [val/val_tokens_per_second: 411240.277] [val/loss_avg_len_2048: 7.520] [val/perplexity_len_2048: 1844.219] [val/loss_avg_len_1024: 7.521] [val/perplexity_len_1024: 1846.058] [val/loss_avg_len_512: 7.526] [val/perplexity_len_512: 1855.284] +[2025-10-26 21:24:06][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:07:50] [ETA: 2:28:54] [loss: 7.356] [tokens/s: 222818.512] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:24:06][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 470.240] [train_eval/train_update_time: 270.009] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.521] [train_eval/perplexity_len_2048: 5019.001] [train_eval/loss_avg_len_1024: 8.522] [train_eval/perplexity_len_1024: 5026.594] [train_eval/loss_avg_len_512: 8.524] [train_eval/perplexity_len_512: 5034.671] +[2025-10-26 21:24:59][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:08:43] [ETA: 2:16:43] [loss: 7.169] [tokens/s: 240455.609] [batches/s: 0.115] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:24:59][train:194][INFO] Running validation... +[2025-10-26 21:26:39][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 523.628] [val/train_update_time: 323.282] [val/loss: 7.165] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.442] [val/val_tokens_per_second: 411897.959] [val/loss_avg_len_2048: 7.165] [val/perplexity_len_2048: 1292.821] [val/loss_avg_len_1024: 7.167] [val/perplexity_len_1024: 1295.904] [val/loss_avg_len_512: 7.175] [val/perplexity_len_512: 1306.548] +[2025-10-26 21:27:32][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:11:16] [ETA: 2:29:47] [loss: 7.043] [tokens/s: 216814.165] [batches/s: 0.103] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:28:26][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:12:09] [ETA: 2:19:53] [loss: 6.880] [tokens/s: 229852.701] [batches/s: 0.110] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:28:26][train:194][INFO] Running validation... +[2025-10-26 21:30:06][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 729.843] [val/train_update_time: 429.837] [val/loss: 6.866] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.848] [val/val_tokens_per_second: 406155.409] [val/loss_avg_len_2048: 6.866] [val/perplexity_len_2048: 959.019] [val/loss_avg_len_1024: 6.870] [val/perplexity_len_1024: 963.405] [val/loss_avg_len_512: 6.883] [val/perplexity_len_512: 975.280] +[2025-10-26 21:31:00][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:14:44] [ETA: 2:28:59] [loss: 6.733] [tokens/s: 213304.064] [batches/s: 0.102] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:31:53][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:15:37] [ETA: 2:20:37] [loss: 6.633] [tokens/s: 223627.540] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:31:53][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 937.465] [train_eval/train_update_time: 536.388] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.966] [train_eval/perplexity_len_2048: 1060.422] [train_eval/loss_avg_len_1024: 6.972] [train_eval/perplexity_len_1024: 1066.603] [train_eval/loss_avg_len_512: 6.982] [train_eval/perplexity_len_512: 1077.249] +[2025-10-26 21:31:53][train:194][INFO] Running validation... +[2025-10-26 21:33:33][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 937.465] [val/train_update_time: 536.388] [val/loss: 6.622] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.013] [val/val_tokens_per_second: 409544.725] [val/loss_avg_len_2048: 6.622] [val/perplexity_len_2048: 751.358] [val/loss_avg_len_1024: 6.628] [val/perplexity_len_1024: 756.021] [val/loss_avg_len_512: 6.644] [val/perplexity_len_512: 767.964] +[2025-10-26 21:33:33][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt... +[2025-10-26 21:33:34][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt. +[2025-10-26 21:33:34][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.443] +[2025-10-26 21:34:27][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:18:11] [ETA: 2:27:09] [loss: 6.560] [tokens/s: 201797.452] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:35:20][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:19:04] [ETA: 2:19:54] [loss: 6.428] [tokens/s: 223522.183] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:35:20][train:194][INFO] Running validation... +[2025-10-26 21:37:01][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1144.691] [val/train_update_time: 642.942] [val/loss: 6.437] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.360] [val/val_tokens_per_second: 408129.969] [val/loss_avg_len_2048: 6.437] [val/perplexity_len_2048: 624.344] [val/loss_avg_len_1024: 6.444] [val/perplexity_len_1024: 628.922] [val/loss_avg_len_512: 6.462] [val/perplexity_len_512: 640.135] +[2025-10-26 21:37:54][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:21:38] [ETA: 2:24:49] [loss: 6.382] [tokens/s: 201724.230] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:38:48][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:22:31] [ETA: 2:18:23] [loss: 6.308] [tokens/s: 223338.996] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:38:48][train:194][INFO] Running validation... +[2025-10-26 21:40:27][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1351.811] [val/train_update_time: 749.497] [val/loss: 6.287] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.596] [val/val_tokens_per_second: 411261.156] [val/loss_avg_len_2048: 6.287] [val/perplexity_len_2048: 537.394] [val/loss_avg_len_1024: 6.295] [val/perplexity_len_1024: 541.773] [val/loss_avg_len_512: 6.314] [val/perplexity_len_512: 552.194] +[2025-10-26 21:41:21][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:25:04] [ETA: 2:22:07] [loss: 6.193] [tokens/s: 201726.821] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:41:21][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1504.793] [train_eval/train_update_time: 802.770] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.409] [train_eval/perplexity_len_2048: 607.101] [train_eval/loss_avg_len_1024: 6.417] [train_eval/perplexity_len_1024: 612.351] [train_eval/loss_avg_len_512: 6.435] [train_eval/perplexity_len_512: 623.146] +[2025-10-26 21:42:14][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:25:58] [ETA: 2:16:20] [loss: 6.161] [tokens/s: 223306.638] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:42:14][train:194][INFO] Running validation... +[2025-10-26 21:43:54][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1558.167] [val/train_update_time: 856.038] [val/loss: 6.162] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.557] [val/val_tokens_per_second: 411423.777] [val/loss_avg_len_2048: 6.162] [val/perplexity_len_2048: 474.439] [val/loss_avg_len_1024: 6.171] [val/perplexity_len_1024: 478.526] [val/loss_avg_len_512: 6.191] [val/perplexity_len_512: 488.116] +[2025-10-26 21:44:47][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:28:31] [ETA: 2:19:14] [loss: 6.076] [tokens/s: 201707.332] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:45:40][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:29:24] [ETA: 2:13:58] [loss: 6.050] [tokens/s: 223617.598] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:45:40][train:194][INFO] Running validation... +[2025-10-26 21:47:20][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1764.496] [val/train_update_time: 962.595] [val/loss: 6.044] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.502] [val/val_tokens_per_second: 411648.137] [val/loss_avg_len_2048: 6.044] [val/perplexity_len_2048: 421.379] [val/loss_avg_len_1024: 6.053] [val/perplexity_len_1024: 425.264] [val/loss_avg_len_512: 6.074] [val/perplexity_len_512: 434.297] +[2025-10-26 21:48:13][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:31:57] [ETA: 2:16:14] [loss: 6.026] [tokens/s: 201973.076] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:49:07][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:32:50] [ETA: 2:11:23] [loss: 5.940] [tokens/s: 223854.498] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:49:07][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1970.751] [train_eval/train_update_time: 1069.133] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.078] [train_eval/perplexity_len_2048: 436.312] [train_eval/loss_avg_len_1024: 6.089] [train_eval/perplexity_len_1024: 440.880] [train_eval/loss_avg_len_512: 6.108] [train_eval/perplexity_len_512: 449.289] +[2025-10-26 21:49:07][train:194][INFO] Running validation... +[2025-10-26 21:50:46][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 1970.751] [val/train_update_time: 1069.133] [val/loss: 5.947] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.669] [val/val_tokens_per_second: 410960.915] [val/loss_avg_len_2048: 5.947] [val/perplexity_len_2048: 382.423] [val/loss_avg_len_1024: 5.956] [val/perplexity_len_1024: 385.960] [val/loss_avg_len_512: 5.977] [val/perplexity_len_512: 394.296] +[2025-10-26 21:50:46][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt... +[2025-10-26 21:50:47][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt. +[2025-10-26 21:50:47][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.435] +[2025-10-26 21:51:40][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:35:24] [ETA: 2:13:11] [loss: 5.902] [tokens/s: 202045.969] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:52:33][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:36:17] [ETA: 2:08:40] [loss: 5.880] [tokens/s: 223919.256] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:52:33][train:194][INFO] Running validation... +[2025-10-26 21:54:13][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2177.606] [val/train_update_time: 1175.684] [val/loss: 5.860] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.702] [val/val_tokens_per_second: 410824.908] [val/loss_avg_len_2048: 5.860] [val/perplexity_len_2048: 350.594] [val/loss_avg_len_1024: 5.869] [val/perplexity_len_1024: 354.067] [val/loss_avg_len_512: 5.892] [val/perplexity_len_512: 362.086] +[2025-10-26 21:55:06][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:38:50] [ETA: 2:10:02] [loss: 5.811] [tokens/s: 202177.051] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:56:00][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:39:44] [ETA: 2:05:49] [loss: 5.756] [tokens/s: 223897.508] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:56:00][train:194][INFO] Running validation... +[2025-10-26 21:57:39][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2384.052] [val/train_update_time: 1282.224] [val/loss: 5.783] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.661] [val/val_tokens_per_second: 410995.065] [val/loss_avg_len_2048: 5.783] [val/perplexity_len_2048: 324.736] [val/loss_avg_len_1024: 5.794] [val/perplexity_len_1024: 328.183] [val/loss_avg_len_512: 5.817] [val/perplexity_len_512: 335.952] +[2025-10-26 21:58:33][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:42:17] [ETA: 2:06:51] [loss: 5.763] [tokens/s: 202168.583] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:58:33][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2537.099] [train_eval/train_update_time: 1335.492] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.844] [train_eval/perplexity_len_2048: 345.304] [train_eval/loss_avg_len_1024: 5.854] [train_eval/perplexity_len_1024: 348.747] [train_eval/loss_avg_len_512: 5.874] [train_eval/perplexity_len_512: 355.783] +[2025-10-26 21:59:26][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:43:10] [ETA: 2:02:52] [loss: 5.718] [tokens/s: 223871.889] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 21:59:26][train:194][INFO] Running validation... +[2025-10-26 22:01:06][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2590.474] [val/train_update_time: 1388.760] [val/loss: 5.716] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.564] [val/val_tokens_per_second: 411393.353] [val/loss_avg_len_2048: 5.716] [val/perplexity_len_2048: 303.822] [val/loss_avg_len_1024: 5.727] [val/perplexity_len_1024: 307.171] [val/loss_avg_len_512: 5.751] [val/perplexity_len_512: 314.628] +[2025-10-26 22:01:59][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:45:43] [ETA: 2:03:37] [loss: 5.693] [tokens/s: 202168.506] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:02:53][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:46:36] [ETA: 1:59:51] [loss: 5.643] [tokens/s: 223861.520] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:02:53][train:194][INFO] Running validation... +[2025-10-26 22:04:32][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 2796.791] [val/train_update_time: 1495.303] [val/loss: 5.650] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.505] [val/val_tokens_per_second: 411637.736] [val/loss_avg_len_2048: 5.650] [val/perplexity_len_2048: 284.241] [val/loss_avg_len_1024: 5.661] [val/perplexity_len_1024: 287.545] [val/loss_avg_len_512: 5.686] [val/perplexity_len_512: 294.724] +[2025-10-26 22:05:25][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:49:09] [ETA: 2:00:21] [loss: 5.611] [tokens/s: 202167.574] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:06:19][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:50:03] [ETA: 1:56:47] [loss: 5.585] [tokens/s: 224004.908] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:06:19][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3003.058] [train_eval/train_update_time: 1601.853] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.678] [train_eval/perplexity_len_2048: 292.453] [train_eval/loss_avg_len_1024: 5.688] [train_eval/perplexity_len_1024: 295.384] [train_eval/loss_avg_len_512: 5.709] [train_eval/perplexity_len_512: 301.718] +[2025-10-26 22:06:19][train:194][INFO] Running validation... +[2025-10-26 22:07:58][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 3003.058] [val/train_update_time: 1601.853] [val/loss: 5.596] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.440] [val/val_tokens_per_second: 411904.901] [val/loss_avg_len_2048: 5.596] [val/perplexity_len_2048: 269.218] [val/loss_avg_len_1024: 5.607] [val/perplexity_len_1024: 272.364] [val/loss_avg_len_512: 5.632] [val/perplexity_len_512: 279.226] +[2025-10-26 22:07:58][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt... +[2025-10-26 22:07:59][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt. +[2025-10-26 22:07:59][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.449] +[2025-10-26 22:08:52][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:52:36] [ETA: 1:57:05] [loss: 5.594] [tokens/s: 202209.727] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:09:45][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:53:29] [ETA: 1:53:40] [loss: 5.528] [tokens/s: 223957.251] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:09:45][train:194][INFO] Running validation... +[2025-10-26 22:11:25][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3209.704] [val/train_update_time: 1708.403] [val/loss: 5.543] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.426] [val/val_tokens_per_second: 411964.367] [val/loss_avg_len_2048: 5.543] [val/perplexity_len_2048: 255.348] [val/loss_avg_len_1024: 5.554] [val/perplexity_len_1024: 258.396] [val/loss_avg_len_512: 5.580] [val/perplexity_len_512: 264.968] +[2025-10-26 22:12:18][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 0:56:02] [ETA: 1:53:46] [loss: 5.541] [tokens/s: 202263.397] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:13:12][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 0:56:55] [ETA: 1:50:30] [loss: 5.502] [tokens/s: 224011.145] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:13:12][train:194][INFO] Running validation... +[2025-10-26 22:14:53][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3415.885] [val/train_update_time: 1814.959] [val/loss: 5.496] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.346] [val/val_tokens_per_second: 404158.233] [val/loss_avg_len_2048: 5.496] [val/perplexity_len_2048: 243.690] [val/loss_avg_len_1024: 5.508] [val/perplexity_len_1024: 246.707] [val/loss_avg_len_512: 5.534] [val/perplexity_len_512: 253.158] +[2025-10-26 22:15:46][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 0:59:30] [ETA: 1:50:31] [loss: 5.483] [tokens/s: 201930.682] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:15:46][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3570.609] [train_eval/train_update_time: 1868.230] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.536] [train_eval/perplexity_len_2048: 253.699] [train_eval/loss_avg_len_1024: 5.548] [train_eval/perplexity_len_1024: 256.749] [train_eval/loss_avg_len_512: 5.571] [train_eval/perplexity_len_512: 262.706] +[2025-10-26 22:16:40][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:00:23] [ETA: 1:47:22] [loss: 5.410] [tokens/s: 223584.636] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:16:40][train:194][INFO] Running validation... +[2025-10-26 22:18:21][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 3623.981] [val/train_update_time: 1921.499] [val/loss: 5.454] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.680] [val/val_tokens_per_second: 402830.694] [val/loss_avg_len_2048: 5.454] [val/perplexity_len_2048: 233.736] [val/loss_avg_len_1024: 5.467] [val/perplexity_len_1024: 236.697] [val/loss_avg_len_512: 5.493] [val/perplexity_len_512: 242.938] +[2025-10-26 22:19:15][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:02:59] [ETA: 1:47:14] [loss: 5.437] [tokens/s: 201513.247] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:20:08][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:03:52] [ETA: 1:44:12] [loss: 5.413] [tokens/s: 223056.690] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:20:08][train:194][INFO] Running validation... +[2025-10-26 22:21:49][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 3832.435] [val/train_update_time: 2028.044] [val/loss: 5.414] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.824] [val/val_tokens_per_second: 406251.122] [val/loss_avg_len_2048: 5.414] [val/perplexity_len_2048: 224.640] [val/loss_avg_len_1024: 5.427] [val/perplexity_len_1024: 227.492] [val/loss_avg_len_512: 5.453] [val/perplexity_len_512: 233.492] +[2025-10-26 22:22:42][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:06:26] [ETA: 1:43:55] [loss: 5.414] [tokens/s: 201250.400] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:23:36][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:07:20] [ETA: 1:41:00] [loss: 5.335] [tokens/s: 222827.518] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:23:36][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4040.047] [train_eval/train_update_time: 2134.612] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.429] [train_eval/perplexity_len_2048: 227.924] [train_eval/loss_avg_len_1024: 5.441] [train_eval/perplexity_len_1024: 230.568] [train_eval/loss_avg_len_512: 5.464] [train_eval/perplexity_len_512: 235.990] +[2025-10-26 22:23:36][train:194][INFO] Running validation... +[2025-10-26 22:25:15][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 4040.047] [val/train_update_time: 2134.612] [val/loss: 5.380] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.629] [val/val_tokens_per_second: 411125.218] [val/loss_avg_len_2048: 5.380] [val/perplexity_len_2048: 217.034] [val/loss_avg_len_1024: 5.393] [val/perplexity_len_1024: 219.868] [val/loss_avg_len_512: 5.419] [val/perplexity_len_512: 225.741] +[2025-10-26 22:25:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt... +[2025-10-26 22:25:16][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt. +[2025-10-26 22:25:16][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.453] +[2025-10-26 22:26:09][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:09:53] [ETA: 1:40:34] [loss: 5.345] [tokens/s: 201209.907] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:27:03][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:10:46] [ETA: 1:37:44] [loss: 5.346] [tokens/s: 222662.081] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:27:03][train:194][INFO] Running validation... +[2025-10-26 22:28:43][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4246.918] [val/train_update_time: 2241.157] [val/loss: 5.350] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.115] [val/val_tokens_per_second: 409127.671] [val/loss_avg_len_2048: 5.350] [val/perplexity_len_2048: 210.547] [val/loss_avg_len_1024: 5.363] [val/perplexity_len_1024: 213.278] [val/loss_avg_len_512: 5.389] [val/perplexity_len_512: 218.952] +[2025-10-26 22:29:36][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:13:20] [ETA: 1:37:13] [loss: 5.333] [tokens/s: 201068.550] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:30:30][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:14:13] [ETA: 1:34:28] [loss: 5.353] [tokens/s: 222950.672] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:30:30][train:194][INFO] Running validation... +[2025-10-26 22:32:11][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 4453.812] [val/train_update_time: 2347.708] [val/loss: 5.318] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.091] [val/val_tokens_per_second: 405178.307] [val/loss_avg_len_2048: 5.318] [val/perplexity_len_2048: 203.957] [val/loss_avg_len_1024: 5.331] [val/perplexity_len_1024: 206.669] [val/loss_avg_len_512: 5.358] [val/perplexity_len_512: 212.257] +[2025-10-26 22:33:04][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:16:48] [ETA: 1:33:52] [loss: 5.297] [tokens/s: 201114.718] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:33:04][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4608.292] [train_eval/train_update_time: 2400.978] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.341] [train_eval/perplexity_len_2048: 208.800] [train_eval/loss_avg_len_1024: 5.355] [train_eval/perplexity_len_1024: 211.619] [train_eval/loss_avg_len_512: 5.380] [train_eval/perplexity_len_512: 217.063] +[2025-10-26 22:33:57][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:17:41] [ETA: 1:31:12] [loss: 5.275] [tokens/s: 223089.747] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:33:57][train:194][INFO] Running validation... +[2025-10-26 22:35:39][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 4661.669] [val/train_update_time: 2454.251] [val/loss: 5.292] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.327] [val/val_tokens_per_second: 404234.005] [val/loss_avg_len_2048: 5.292] [val/perplexity_len_2048: 198.670] [val/loss_avg_len_1024: 5.305] [val/perplexity_len_1024: 201.322] [val/loss_avg_len_512: 5.332] [val/perplexity_len_512: 206.791] +[2025-10-26 22:36:32][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:20:16] [ETA: 1:30:31] [loss: 5.291] [tokens/s: 201181.011] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:37:26][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:21:09] [ETA: 1:27:55] [loss: 5.264] [tokens/s: 222967.653] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:37:26][train:194][INFO] Running validation... +[2025-10-26 22:39:07][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 4869.772] [val/train_update_time: 2560.803] [val/loss: 5.267] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.225] [val/val_tokens_per_second: 404642.453] [val/loss_avg_len_2048: 5.267] [val/perplexity_len_2048: 193.870] [val/loss_avg_len_1024: 5.281] [val/perplexity_len_1024: 196.476] [val/loss_avg_len_512: 5.308] [val/perplexity_len_512: 201.870] +[2025-10-26 22:40:00][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:23:44] [ETA: 1:27:09] [loss: 5.269] [tokens/s: 201109.121] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:40:54][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:24:37] [ETA: 1:24:37] [loss: 5.245] [tokens/s: 222703.660] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:40:54][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5077.749] [train_eval/train_update_time: 2667.341] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.274] [train_eval/perplexity_len_2048: 195.220] [train_eval/loss_avg_len_1024: 5.287] [train_eval/perplexity_len_1024: 197.736] [train_eval/loss_avg_len_512: 5.313] [train_eval/perplexity_len_512: 203.031] +[2025-10-26 22:40:54][train:194][INFO] Running validation... +[2025-10-26 22:42:35][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 5077.749] [val/train_update_time: 2667.341] [val/loss: 5.244] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.128] [val/val_tokens_per_second: 405031.633] [val/loss_avg_len_2048: 5.244] [val/perplexity_len_2048: 189.484] [val/loss_avg_len_1024: 5.258] [val/perplexity_len_1024: 192.096] [val/loss_avg_len_512: 5.286] [val/perplexity_len_512: 197.462] +[2025-10-26 22:42:35][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt... +[2025-10-26 22:42:35][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt. +[2025-10-26 22:42:35][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.443] +[2025-10-26 22:43:28][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:27:12] [ETA: 1:23:47] [loss: 5.242] [tokens/s: 200822.919] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:44:22][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:28:06] [ETA: 1:21:19] [loss: 5.235] [tokens/s: 222358.851] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:44:22][train:194][INFO] Running validation... +[2025-10-26 22:46:04][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 5286.093] [val/train_update_time: 2773.886] [val/loss: 5.224] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.726] [val/val_tokens_per_second: 402649.191] [val/loss_avg_len_2048: 5.224] [val/perplexity_len_2048: 185.696] [val/loss_avg_len_1024: 5.238] [val/perplexity_len_1024: 188.252] [val/loss_avg_len_512: 5.265] [val/perplexity_len_512: 193.484] +[2025-10-26 22:46:57][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:30:41] [ETA: 1:20:25] [loss: 5.193] [tokens/s: 200512.953] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:47:50][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:31:34] [ETA: 1:18:00] [loss: 5.209] [tokens/s: 222204.047] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:47:50][train:194][INFO] Running validation... +[2025-10-26 22:49:32][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 5494.619] [val/train_update_time: 2880.433] [val/loss: 5.204] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.620] [val/val_tokens_per_second: 403068.537] [val/loss_avg_len_2048: 5.204] [val/perplexity_len_2048: 182.063] [val/loss_avg_len_1024: 5.218] [val/perplexity_len_1024: 184.597] [val/loss_avg_len_512: 5.246] [val/perplexity_len_512: 189.768] +[2025-10-26 22:50:25][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:34:09] [ETA: 1:17:02] [loss: 5.156] [tokens/s: 200408.069] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:50:25][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5649.622] [train_eval/train_update_time: 2933.706] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.217] [train_eval/perplexity_len_2048: 184.366] [train_eval/loss_avg_len_1024: 5.227] [train_eval/perplexity_len_1024: 186.300] [train_eval/loss_avg_len_512: 5.251] [train_eval/perplexity_len_512: 190.837] +[2025-10-26 22:51:19][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:35:03] [ETA: 1:14:40] [loss: 5.195] [tokens/s: 222130.660] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:51:19][train:194][INFO] Running validation... +[2025-10-26 22:52:59][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 5703.017] [val/train_update_time: 2986.992] [val/loss: 5.188] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.531] [val/val_tokens_per_second: 407436.220] [val/loss_avg_len_2048: 5.188] [val/perplexity_len_2048: 179.085] [val/loss_avg_len_1024: 5.202] [val/perplexity_len_1024: 181.607] [val/loss_avg_len_512: 5.230] [val/perplexity_len_512: 186.735] +[2025-10-26 22:53:53][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:37:36] [ETA: 1:13:38] [loss: 5.166] [tokens/s: 200559.622] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:54:46][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:38:30] [ETA: 1:11:19] [loss: 5.197] [tokens/s: 222297.302] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:54:46][train:194][INFO] Running validation... +[2025-10-26 22:56:26][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 5910.319] [val/train_update_time: 3093.534] [val/loss: 5.172] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.040] [val/val_tokens_per_second: 409434.268] [val/loss_avg_len_2048: 5.172] [val/perplexity_len_2048: 176.189] [val/loss_avg_len_1024: 5.186] [val/perplexity_len_1024: 178.683] [val/loss_avg_len_512: 5.214] [val/perplexity_len_512: 183.758] +[2025-10-26 22:57:20][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:41:03] [ETA: 1:10:13] [loss: 5.171] [tokens/s: 200785.912] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:58:13][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:41:57] [ETA: 1:07:58] [loss: 5.175] [tokens/s: 222658.190] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 22:58:13][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6117.131] [train_eval/train_update_time: 3200.081] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.172] [train_eval/perplexity_len_2048: 176.260] [train_eval/loss_avg_len_1024: 5.182] [train_eval/perplexity_len_1024: 178.088] [train_eval/loss_avg_len_512: 5.207] [train_eval/perplexity_len_512: 182.607] +[2025-10-26 22:58:13][train:194][INFO] Running validation... +[2025-10-26 22:59:53][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 6117.131] [val/train_update_time: 3200.081] [val/loss: 5.157] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.893] [val/val_tokens_per_second: 410039.290] [val/loss_avg_len_2048: 5.157] [val/perplexity_len_2048: 173.723] [val/loss_avg_len_1024: 5.172] [val/perplexity_len_1024: 176.211] [val/loss_avg_len_512: 5.200] [val/perplexity_len_512: 181.238] +[2025-10-26 22:59:53][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt... +[2025-10-26 22:59:53][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt. +[2025-10-26 22:59:53][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.449] +[2025-10-26 23:00:47][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:44:30] [ETA: 1:06:49] [loss: 5.161] [tokens/s: 201017.630] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:01:40][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:45:24] [ETA: 1:04:36] [loss: 5.136] [tokens/s: 222982.480] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:01:40][train:194][INFO] Running validation... +[2025-10-26 23:03:20][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 6324.271] [val/train_update_time: 3306.647] [val/loss: 5.144] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.145] [val/val_tokens_per_second: 409008.036] [val/loss_avg_len_2048: 5.144] [val/perplexity_len_2048: 171.381] [val/loss_avg_len_1024: 5.158] [val/perplexity_len_1024: 173.834] [val/loss_avg_len_512: 5.186] [val/perplexity_len_512: 178.815] +[2025-10-26 23:04:14][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:47:57] [ETA: 1:03:24] [loss: 5.146] [tokens/s: 201329.619] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:05:07][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 1:48:51] [ETA: 1:01:13] [loss: 5.153] [tokens/s: 223342.819] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:05:07][train:194][INFO] Running validation... +[2025-10-26 23:06:47][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 6531.186] [val/train_update_time: 3413.191] [val/loss: 5.131] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.948] [val/val_tokens_per_second: 409812.777] [val/loss_avg_len_2048: 5.131] [val/perplexity_len_2048: 169.188] [val/loss_avg_len_1024: 5.145] [val/perplexity_len_1024: 171.645] [val/loss_avg_len_512: 5.174] [val/perplexity_len_512: 176.611] +[2025-10-26 23:07:40][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 1:51:24] [ETA: 0:59:59] [loss: 5.122] [tokens/s: 201658.399] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:07:40][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6684.524] [train_eval/train_update_time: 3466.462] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.145] [train_eval/perplexity_len_2048: 171.577] [train_eval/loss_avg_len_1024: 5.160] [train_eval/perplexity_len_1024: 174.117] [train_eval/loss_avg_len_512: 5.185] [train_eval/perplexity_len_512: 178.545] +[2025-10-26 23:08:34][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 1:52:17] [ETA: 0:57:51] [loss: 5.144] [tokens/s: 223482.439] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:08:34][train:194][INFO] Running validation... +[2025-10-26 23:10:13][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 6737.915] [val/train_update_time: 3519.732] [val/loss: 5.120] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.735] [val/val_tokens_per_second: 410687.485] [val/loss_avg_len_2048: 5.120] [val/perplexity_len_2048: 167.401] [val/loss_avg_len_1024: 5.135] [val/perplexity_len_1024: 169.836] [val/loss_avg_len_512: 5.163] [val/perplexity_len_512: 174.755] +[2025-10-26 23:11:07][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 1:54:51] [ETA: 0:56:34] [loss: 5.096] [tokens/s: 201810.947] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:12:00][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 1:55:44] [ETA: 0:54:27] [loss: 5.104] [tokens/s: 223547.884] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:12:00][train:194][INFO] Running validation... +[2025-10-26 23:13:40][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 6944.454] [val/train_update_time: 3626.288] [val/loss: 5.111] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.924] [val/val_tokens_per_second: 409912.887] [val/loss_avg_len_2048: 5.111] [val/perplexity_len_2048: 165.787] [val/loss_avg_len_1024: 5.125] [val/perplexity_len_1024: 168.201] [val/loss_avg_len_512: 5.154] [val/perplexity_len_512: 173.115] +[2025-10-26 23:14:34][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 1:58:17] [ETA: 0:53:08] [loss: 5.114] [tokens/s: 201831.092] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:15:27][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 1:59:11] [ETA: 0:51:04] [loss: 5.101] [tokens/s: 223651.229] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:15:27][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7151.144] [train_eval/train_update_time: 3732.833] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.112] [train_eval/perplexity_len_2048: 166.072] [train_eval/loss_avg_len_1024: 5.126] [train_eval/perplexity_len_1024: 168.424] [train_eval/loss_avg_len_512: 5.155] [train_eval/perplexity_len_512: 173.216] +[2025-10-26 23:15:27][train:194][INFO] Running validation... +[2025-10-26 23:17:07][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 7151.144] [val/train_update_time: 3732.833] [val/loss: 5.101] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.963] [val/val_tokens_per_second: 409749.591] [val/loss_avg_len_2048: 5.101] [val/perplexity_len_2048: 164.254] [val/loss_avg_len_1024: 5.116] [val/perplexity_len_1024: 166.669] [val/loss_avg_len_512: 5.145] [val/perplexity_len_512: 171.560] +[2025-10-26 23:17:07][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt... +[2025-10-26 23:17:07][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt. +[2025-10-26 23:17:07][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.452] +[2025-10-26 23:18:01][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:01:44] [ETA: 0:49:43] [loss: 5.092] [tokens/s: 201822.272] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:18:54][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:02:38] [ETA: 0:47:41] [loss: 5.101] [tokens/s: 223589.767] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:18:54][train:194][INFO] Running validation... +[2025-10-26 23:20:34][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 7358.337] [val/train_update_time: 3839.386] [val/loss: 5.093] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.891] [val/val_tokens_per_second: 410045.618] [val/loss_avg_len_2048: 5.093] [val/perplexity_len_2048: 162.938] [val/loss_avg_len_1024: 5.108] [val/perplexity_len_1024: 165.350] [val/loss_avg_len_512: 5.137] [val/perplexity_len_512: 170.232] +[2025-10-26 23:21:27][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:05:11] [ETA: 0:46:18] [loss: 5.109] [tokens/s: 201870.189] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:22:21][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:06:05] [ETA: 0:44:17] [loss: 5.105] [tokens/s: 223595.012] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:22:21][train:194][INFO] Running validation... +[2025-10-26 23:24:01][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 7565.031] [val/train_update_time: 3945.941] [val/loss: 5.086] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.777] [val/val_tokens_per_second: 410516.517] [val/loss_avg_len_2048: 5.086] [val/perplexity_len_2048: 161.804] [val/loss_avg_len_1024: 5.101] [val/perplexity_len_1024: 164.202] [val/loss_avg_len_512: 5.130] [val/perplexity_len_512: 169.052] +[2025-10-26 23:24:54][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:08:38] [ETA: 0:42:52] [loss: 5.097] [tokens/s: 201891.999] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:24:54][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7718.234] [train_eval/train_update_time: 3999.225] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.092] [train_eval/perplexity_len_2048: 162.718] [train_eval/loss_avg_len_1024: 5.107] [train_eval/perplexity_len_1024: 165.165] [train_eval/loss_avg_len_512: 5.134] [train_eval/perplexity_len_512: 169.733] +[2025-10-26 23:25:47][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:09:31] [ETA: 0:40:54] [loss: 5.053] [tokens/s: 223573.837] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:25:47][train:194][INFO] Running validation... +[2025-10-26 23:27:27][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 7771.637] [val/train_update_time: 4052.504] [val/loss: 5.080] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.857] [val/val_tokens_per_second: 410185.320] [val/loss_avg_len_2048: 5.080] [val/perplexity_len_2048: 160.844] [val/loss_avg_len_1024: 5.095] [val/perplexity_len_1024: 163.238] [val/loss_avg_len_512: 5.124] [val/perplexity_len_512: 168.074] +[2025-10-26 23:28:21][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:12:04] [ETA: 0:39:27] [loss: 5.113] [tokens/s: 201867.208] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:29:14][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:12:58] [ETA: 0:37:30] [loss: 5.037] [tokens/s: 223591.734] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:29:14][train:194][INFO] Running validation... +[2025-10-26 23:30:56][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 7978.290] [val/train_update_time: 4159.058] [val/loss: 5.075] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.504] [val/val_tokens_per_second: 403532.450] [val/loss_avg_len_2048: 5.075] [val/perplexity_len_2048: 159.990] [val/loss_avg_len_1024: 5.090] [val/perplexity_len_1024: 162.369] [val/loss_avg_len_512: 5.119] [val/perplexity_len_512: 167.195] +[2025-10-26 23:31:49][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:15:33] [ETA: 0:36:01] [loss: 5.071] [tokens/s: 201551.794] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:32:42][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:16:26] [ETA: 0:34:06] [loss: 5.039] [tokens/s: 223322.014] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:32:42][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8186.597] [train_eval/train_update_time: 4265.605] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.077] [train_eval/perplexity_len_2048: 160.213] [train_eval/loss_avg_len_1024: 5.092] [train_eval/perplexity_len_1024: 162.774] [train_eval/loss_avg_len_512: 5.120] [train_eval/perplexity_len_512: 167.277] +[2025-10-26 23:32:42][train:194][INFO] Running validation... +[2025-10-26 23:34:24][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 8186.597] [val/train_update_time: 4265.605] [val/loss: 5.070] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.590] [val/val_tokens_per_second: 403191.092] [val/loss_avg_len_2048: 5.070] [val/perplexity_len_2048: 159.243] [val/loss_avg_len_1024: 5.085] [val/perplexity_len_1024: 161.622] [val/loss_avg_len_512: 5.115] [val/perplexity_len_512: 166.439] +[2025-10-26 23:34:24][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt... +[2025-10-26 23:34:24][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt. +[2025-10-26 23:34:24][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.434] +[2025-10-26 23:35:18][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:19:02] [ETA: 0:32:36] [loss: 5.020] [tokens/s: 201236.804] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:36:11][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:19:55] [ETA: 0:30:42] [loss: 5.034] [tokens/s: 222808.847] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:36:11][train:194][INFO] Running validation... +[2025-10-26 23:37:53][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 8395.407] [val/train_update_time: 4372.160] [val/loss: 5.066] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.431] [val/val_tokens_per_second: 403819.955] [val/loss_avg_len_2048: 5.066] [val/perplexity_len_2048: 158.613] [val/loss_avg_len_1024: 5.081] [val/perplexity_len_1024: 160.994] [val/loss_avg_len_512: 5.111] [val/perplexity_len_512: 165.808] +[2025-10-26 23:38:46][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:22:30] [ETA: 0:29:11] [loss: 5.062] [tokens/s: 200933.563] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:39:39][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:23:23] [ETA: 0:27:18] [loss: 5.039] [tokens/s: 222414.750] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:39:39][train:194][INFO] Running validation... +[2025-10-26 23:41:19][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 8603.637] [val/train_update_time: 4478.716] [val/loss: 5.063] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.801] [val/val_tokens_per_second: 410416.185] [val/loss_avg_len_2048: 5.063] [val/perplexity_len_2048: 158.094] [val/loss_avg_len_1024: 5.078] [val/perplexity_len_1024: 160.470] [val/loss_avg_len_512: 5.108] [val/perplexity_len_512: 165.272] +[2025-10-26 23:42:13][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:25:56] [ETA: 0:25:45] [loss: 5.099] [tokens/s: 200938.684] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:42:13][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8756.834] [train_eval/train_update_time: 4531.993] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.061] [train_eval/perplexity_len_2048: 157.686] [train_eval/loss_avg_len_1024: 5.071] [train_eval/perplexity_len_1024: 159.400] [train_eval/loss_avg_len_512: 5.099] [train_eval/perplexity_len_512: 163.821] +[2025-10-26 23:43:06][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:26:50] [ETA: 0:23:54] [loss: 5.075] [tokens/s: 222436.658] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:43:06][train:194][INFO] Running validation... +[2025-10-26 23:44:46][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 8810.230] [val/train_update_time: 4585.267] [val/loss: 5.061] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.596] [val/val_tokens_per_second: 411261.912] [val/loss_avg_len_2048: 5.061] [val/perplexity_len_2048: 157.677] [val/loss_avg_len_1024: 5.075] [val/perplexity_len_1024: 160.050] [val/loss_avg_len_512: 5.105] [val/perplexity_len_512: 164.844] +[2025-10-26 23:45:39][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:29:23] [ETA: 0:22:19] [loss: 5.019] [tokens/s: 200990.678] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:46:32][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:30:16] [ETA: 0:20:29] [loss: 5.056] [tokens/s: 222896.705] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:46:32][train:194][INFO] Running validation... +[2025-10-26 23:48:12][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 9016.607] [val/train_update_time: 4691.817] [val/loss: 5.058] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.554] [val/val_tokens_per_second: 411433.791] [val/loss_avg_len_2048: 5.058] [val/perplexity_len_2048: 157.352] [val/loss_avg_len_1024: 5.073] [val/perplexity_len_1024: 159.725] [val/loss_avg_len_512: 5.103] [val/perplexity_len_512: 164.520] +[2025-10-26 23:49:05][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:32:49] [ETA: 0:18:53] [loss: 5.094] [tokens/s: 201373.463] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:49:59][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:33:42] [ETA: 0:17:04] [loss: 5.027] [tokens/s: 223490.653] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:49:59][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9222.956] [train_eval/train_update_time: 4798.368] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.061] [train_eval/perplexity_len_2048: 157.725] [train_eval/loss_avg_len_1024: 5.075] [train_eval/perplexity_len_1024: 160.002] [train_eval/loss_avg_len_512: 5.102] [train_eval/perplexity_len_512: 164.418] +[2025-10-26 23:49:59][train:194][INFO] Running validation... +[2025-10-26 23:51:39][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 9222.956] [val/train_update_time: 4798.368] [val/loss: 5.057] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.031] [val/val_tokens_per_second: 409473.595] [val/loss_avg_len_2048: 5.057] [val/perplexity_len_2048: 157.117] [val/loss_avg_len_1024: 5.072] [val/perplexity_len_1024: 159.489] [val/loss_avg_len_512: 5.102] [val/perplexity_len_512: 164.282] +[2025-10-26 23:51:39][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt... +[2025-10-26 23:51:39][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt. +[2025-10-26 23:51:39][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.442] +[2025-10-26 23:52:33][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:36:16] [ETA: 0:15:27] [loss: 5.049] [tokens/s: 201672.909] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:53:26][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 2:37:10] [ETA: 0:13:40] [loss: 5.089] [tokens/s: 223717.151] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:53:26][train:194][INFO] Running validation... +[2025-10-26 23:55:06][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 9430.235] [val/train_update_time: 4904.913] [val/loss: 5.056] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.070] [val/val_tokens_per_second: 409313.027] [val/loss_avg_len_2048: 5.056] [val/perplexity_len_2048: 156.924] [val/loss_avg_len_1024: 5.071] [val/perplexity_len_1024: 159.296] [val/loss_avg_len_512: 5.100] [val/perplexity_len_512: 164.083] +[2025-10-26 23:55:59][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 2:39:43] [ETA: 0:12:01] [loss: 5.060] [tokens/s: 201942.074] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:56:53][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 2:40:37] [ETA: 0:10:15] [loss: 5.021] [tokens/s: 223658.926] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:56:53][train:194][INFO] Running validation... +[2025-10-26 23:58:33][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 9637.073] [val/train_update_time: 5011.441] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.583] [val/val_tokens_per_second: 407224.794] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.817] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.188] [val/loss_avg_len_512: 5.100] [val/perplexity_len_512: 163.978] +[2025-10-26 23:59:27][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 2:43:11] [ETA: 0:08:35] [loss: 5.055] [tokens/s: 201792.021] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-26 23:59:27][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9791.052] [train_eval/train_update_time: 5064.716] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.052] [train_eval/perplexity_len_2048: 156.309] [train_eval/loss_avg_len_1024: 5.068] [train_eval/perplexity_len_1024: 158.884] [train_eval/loss_avg_len_512: 5.095] [train_eval/perplexity_len_512: 163.199] +[2025-10-27 00:00:20][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 2:44:04] [ETA: 0:06:50] [loss: 5.061] [tokens/s: 223422.133] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-27 00:00:20][train:194][INFO] Running validation... +[2025-10-27 00:02:00][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 9844.444] [val/train_update_time: 5117.988] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.617] [val/val_tokens_per_second: 411173.096] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.755] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.125] [val/loss_avg_len_512: 5.099] [val/perplexity_len_512: 163.911] +[2025-10-27 00:02:53][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 2:46:37] [ETA: 0:05:09] [loss: 5.073] [tokens/s: 201787.348] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-27 00:03:47][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 2:47:30] [ETA: 0:03:25] [loss: 5.058] [tokens/s: 223406.851] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-27 00:03:47][train:194][INFO] Running validation... +[2025-10-27 00:05:26][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 10050.842] [val/train_update_time: 5224.531] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.766] [val/val_tokens_per_second: 410561.508] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.726] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.097] [val/loss_avg_len_512: 5.099] [val/perplexity_len_512: 163.883] +[2025-10-27 00:05:26][train:854][INFO] Training finished with 2055208960 tokens! diff --git a/metrics/jsonlines/checkpoint.jsonl b/metrics/jsonlines/checkpoint.jsonl index da73e08149e76c9ceb7cd0f20c31a2e6c64a1281..5549bf1c5185abbb7f8f046d72bc8008f5ee8a7a 100644 --- a/metrics/jsonlines/checkpoint.jsonl +++ b/metrics/jsonlines/checkpoint.jsonl @@ -1,9 +1,9 @@ -{"step": 209715200, "checkpoint/checkpoint_time": 0.41898034600308165} -{"step": 419430400, "checkpoint/checkpoint_time": 0.4345024300273508} -{"step": 629145600, "checkpoint/checkpoint_time": 0.4318052630405873} -{"step": 838860800, "checkpoint/checkpoint_time": 0.41630766697926447} -{"step": 1048576000, "checkpoint/checkpoint_time": 0.42021259502507746} -{"step": 1258291200, "checkpoint/checkpoint_time": 0.4185596199822612} -{"step": 1468006400, "checkpoint/checkpoint_time": 0.42701432603644207} -{"step": 1677721600, "checkpoint/checkpoint_time": 0.41897460201289505} -{"step": 1887436800, "checkpoint/checkpoint_time": 0.42052310600411147} +{"step": 209715200, "checkpoint/checkpoint_time": 0.44336505798855796} +{"step": 419430400, "checkpoint/checkpoint_time": 0.43483636603923514} +{"step": 629145600, "checkpoint/checkpoint_time": 0.44907815201440826} +{"step": 838860800, "checkpoint/checkpoint_time": 0.45288487296784297} +{"step": 1048576000, "checkpoint/checkpoint_time": 0.442782363970764} +{"step": 1258291200, "checkpoint/checkpoint_time": 0.4494084370089695} +{"step": 1468006400, "checkpoint/checkpoint_time": 0.4516124309739098} +{"step": 1677721600, "checkpoint/checkpoint_time": 0.43384581699501723} +{"step": 1887436800, "checkpoint/checkpoint_time": 0.4421905800118111} diff --git a/metrics/jsonlines/norm.jsonl b/metrics/jsonlines/norm.jsonl index b3fed02c10f4741433d663c637863be3f76a4575..176c3486a3e5209ca675b30ad80adb8ae7e69fb9 100644 --- a/metrics/jsonlines/norm.jsonl +++ b/metrics/jsonlines/norm.jsonl @@ -1,98 +1,98 @@ -{"step": 20971520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 71.96881103515625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.04931756854057312, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.992537498474121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.001354195992462337, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.17136812210083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.005113133694976568, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.1492719650268555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.005361487157642841, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.148346900939941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.03569701686501503, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.157134056091309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.04859068989753723, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.987136840820312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0012969253584742546, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 12.615348815917969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.042890001088380814, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 8.929548263549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.04455192759633064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.001537322998047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0014995726523920894, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.157523155212402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.00249459408223629, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.185312747955322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.002509487560018897, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.166274547576904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07670692354440689, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.143224239349365, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1674599051475525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.005922317504883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002554683480411768, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 12.627293586730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06328214704990387, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 8.90603256225586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07561046630144119, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.017423629760742, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.06265711784362793, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 72.25432586669922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.9707998037338257} -{"step": 41943040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 73.00874328613281, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0454106442630291, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.99699592590332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0011847913265228271, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.457751750946045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.007312293630093336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.446045398712158, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0047170789912343025, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.281281471252441, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.024224666878581047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.282985687255859, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.02802717685699463, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.979215621948242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0007766256458126009, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 12.957906723022461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.021227773278951645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.175869941711426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.022060437127947807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.006433486938477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0005141213769093156, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.340758323669434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0021936874836683273, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.3706746101379395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0022677304223179817, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.303279399871826, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.017137344926595688, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.284801006317139, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.03909695893526077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.983871459960938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0012422859435901046, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 12.90445613861084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.02115129865705967, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.110078811645508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.028933899477124214, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.15167236328125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.07554862648248672, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 80.06192779541016, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.9559218287467957} -{"step": 62914560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 74.06412506103516, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14560027420520782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.025676727294922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006493757478892803, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.736934185028076, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024993088096380234, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.785252094268799, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.01461393665522337, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.387188911437988, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1426747739315033, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.378756999969482, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20323581993579865, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.988592147827148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.012646134942770004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.171303749084473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.2682206332683563, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.33739948272705, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.24660450220108032, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.001752853393555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.012365789152681828, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.451326370239258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013072025962173939, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.480714321136475, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.012566701509058475, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.373334884643555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16836988925933838, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.36562442779541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.13005593419075012, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.949560165405273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003467398229986429, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.050443649291992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09993962943553925, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.23079776763916, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07670039683580399, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.29753875732422, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01719731278717518, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 89.52503967285156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.21639637649059296} -{"step": 83886080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 75.31282806396484, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1636553704738617, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.041820526123047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007256134878844023, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.84732723236084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.037294164299964905, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.921545505523682, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.025870652869343758, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.467042446136475, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15088491141796112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.449705123901367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1598309725522995, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.002822875976562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.008238726295530796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.2990083694458, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12890076637268066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.436783790588379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13208335638046265, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.990812301635742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006102641113102436, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.475791931152344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017084632068872452, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.492623329162598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.018234234303236008, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.390182971954346, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12115395069122314, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.391382217407227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09408573061227798, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.919923782348633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004328988958150148, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.12956428527832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08783417195081711, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.288615226745605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05823551490902901, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.415800094604492, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.020610297098755836, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 96.92670440673828, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.17371150851249695} -{"step": 104857600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 76.76067352294922, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10974003374576569, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.053102493286133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004903715569525957, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.933706283569336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023936495184898376, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.015637397766113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.018652038648724556, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.532181739807129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.10124502331018448, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.51088285446167, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10389731824398041, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.01064682006836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004180931486189365, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.401520729064941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.072423055768013, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.518519401550293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07662675529718399, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.983317375183105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0032914725597947836, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.494405269622803, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011478925123810768, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.5041351318359375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.009417949244379997, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.402820587158203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0708518922328949, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.411530494689941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05026652291417122, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.890864372253418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0020234219264239073, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.180601119995117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05218706279993057, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.323247909545898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03306099399924278, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.52654266357422, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.025772524997591972, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 102.65660858154297, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.12112192064523697} -{"step": 125829120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 78.26143646240234, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14764313399791718, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.06210708618164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005774522665888071, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.019710063934326, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023557797074317932, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.107157230377197, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.021521462127566338, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.57767915725708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13689817488193512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.553013324737549, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14891035854816437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.018550872802734, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007017810828983784, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.500693321228027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12979170680046082, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.597635269165039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14138919115066528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.972907066345215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005581257864832878, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.505892753601074, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0140938526019454, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.511193752288818, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017063327133655548, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.403090953826904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10899626463651657, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.416842460632324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07361466437578201, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.878518104553223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004751104395836592, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.239147186279297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10007952898740768, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.36572265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05429547280073166, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.641582489013672, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.019994335249066353, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 107.41888427734375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09431178867816925} -{"step": 146800640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 79.72244262695312, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11071459949016571, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.070402145385742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0051385038532316685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.099991321563721, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025414323434233665, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.193478107452393, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.022922545671463013, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.612677097320557, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11548683792352676, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.586679935455322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12137079983949661, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.023895263671875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004222680348902941, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.595492362976074, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09106919169425964, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.668949127197266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10412771999835968, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.965679168701172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003967868164181709, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.532543659210205, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011267081834375858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.532029151916504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.012786910869181156, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.403209209442139, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08823184669017792, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.421188831329346, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06287765502929688, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.873068809509277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004660701844841242, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.30808162689209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10324981808662415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.412495613098145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05293235182762146, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.764650344848633, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02192128263413906, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 111.91621398925781, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07226899266242981} -{"step": 167772160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 81.11312103271484, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15527375042438507, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.078125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005120676942169666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.169868469238281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.042061757296323776, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.2672295570373535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03919970616698265, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.639499664306641, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12624973058700562, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.614434242248535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1481141746044159, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.024337768554688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005756995175033808, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.683746337890625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1082032173871994, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.737713813781738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08984449505805969, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.96324634552002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.002943219617009163, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.580774307250977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013593507930636406, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.573145389556885, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.01925147883594036, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.404112815856934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07445023953914642, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.427469253540039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.055045049637556076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.870956420898438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0026173419319093227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.386730194091797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.052688393741846085, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.468692779541016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.035167232155799866, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.898025512695312, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.030752452090382576, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 116.54308319091797, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.10764619708061218} -{"step": 188743680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 82.40393829345703, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18294812738895416, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.088031768798828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005838277284055948, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.233952045440674, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04238370805978775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.335366725921631, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.045077286660671234, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.662285327911377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.142694354057312, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.637637138366699, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14771829545497894, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.021242141723633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003689328208565712, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.753194808959961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09966854751110077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.79158878326416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0833396166563034, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.961755752563477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0020248310174793005, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.6178460121154785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012656306847929955, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.607946395874023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.01569214090704918, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.407561302185059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06148098409175873, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.436041831970215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05592111498117447, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.869962692260742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00333069683983922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.461437225341797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07536806911230087, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.520127296447754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04022948443889618, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.03124237060547, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.024885227903723717, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 121.19889068603516, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07714866101741791} -{"step": 209715200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 83.5899429321289, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.23450349271297455, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.096881866455078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.009616409428417683, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.28783655166626, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.05348244681954384, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.391678333282471, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.06199745088815689, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6802239418029785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.25547072291374207, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.6560587882995605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2666328549385071, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.01774787902832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0066641164012253284, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.814249992370605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15700750052928925, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.835580825805664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13883163034915924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.95808219909668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004257276188582182, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.643918991088867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.027993781492114067, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.633237361907959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03959635645151138, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.409819602966309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11706466972827911, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.442257881164551, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10637278854846954, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.873372077941895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0070185428485274315, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.54623794555664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.16334129869937897, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.573434829711914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.085782989859581, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.16257095336914, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.03155239671468735, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 125.78623962402344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.14592701196670532} -{"step": 230686720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 84.69239044189453, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1594199389219284, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.104360580444336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006047455593943596, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.337196350097656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.035087380558252335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.442198753356934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.044717609882354736, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.692870140075684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17536436021327972, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.669292449951172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19787657260894775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.015531539916992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0045129950158298016, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.873809814453125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11859696358442307, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.87357234954834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09206336736679077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.953989028930664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0030234321020543575, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.665635585784912, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019278470426797867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.654614448547363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.027543406933546066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.412269115447998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08041753619909286, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.447020053863525, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07413987070322037, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.877509117126465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002618588740006089, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.628966331481934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0659140944480896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.621295928955078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.041548602283000946, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.293792724609375, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02512042224407196, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 130.28053283691406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07979883253574371} -{"step": 251658240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 85.72723388671875, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.26993507146835327, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.113637924194336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.010271855629980564, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.395116806030273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.07125205546617508, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.498762130737305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.07951226830482483, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.703657627105713, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.264143705368042, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.680201053619385, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.27783527970314026, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.01318359375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00782782956957817, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.927314758300781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17763859033584595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.905048370361328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14943763613700867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.951674461364746, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0055868192575871944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.6968584060668945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.020934268832206726, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.682009696960449, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02988775074481964, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.414920806884766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14132586121559143, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.451683521270752, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10885846614837646, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.88360595703125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003912162035703659, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.711220741271973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10075945407152176, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.671493530273438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06265780329704285, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.426355361938477, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.024128690361976624, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 134.6912384033203, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0995798408985138} -{"step": 272629760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 86.69818878173828, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1653515249490738, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.12265396118164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005828904919326305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.450631618499756, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03169155493378639, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.551648139953613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0362689234316349, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.71484899520874, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17087310552597046, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.691439151763916, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17820800840854645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.00946044921875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0038880317006260157, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.971800804138184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1117776557803154, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.9283447265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10256582498550415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.951170921325684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0030098019633442163, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.730766296386719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014596781693398952, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.7123122215271, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020399650558829308, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.420034408569336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09251973778009415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.4582414627075195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07419509440660477, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.88968563079834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002972376998513937, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.790186882019043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08366235345602036, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.716582298278809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.056474488228559494, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.557889938354492, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.018584130331873894, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 138.9700469970703, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06315627694129944} -{"step": 293601280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 87.62620544433594, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.19032599031925201, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.13004493713379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006730176974087954, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.506270885467529, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.043432578444480896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.604267597198486, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04684501141309738, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.720642566680908, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17181695997714996, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.698472499847412, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18875055015087128, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.006467819213867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0047960164956748486, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.012567520141602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11575763672590256, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.949236869812012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11300285905599594, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.953019142150879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004510291386395693, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.767028331756592, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.023713236674666405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.744261264801025, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04127831384539604, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.427191734313965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12679380178451538, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.466919898986816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08619166165590286, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.89726734161377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00509621761739254, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.870442390441895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12431024014949799, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.761052131652832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07181130349636078, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.691904067993164, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.016062138602137566, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 143.19581604003906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07588590681552887} -{"step": 314572800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 88.51741790771484, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15481190383434296, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.136295318603516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006256000604480505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.561213970184326, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02961118333041668, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.653446197509766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03812626749277115, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.722362041473389, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17562948167324066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.702115058898926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18268033862113953, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.002885818481445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003687445539981127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.048123359680176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11005576699972153, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.966753005981445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10191681236028671, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.954724311828613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003967373166233301, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.80466365814209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017651716247200966, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.776679039001465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030177999287843704, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.432142734527588, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11184961348772049, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.473326683044434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07832980901002884, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.907949447631836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002977752359583974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.954350471496582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08299226313829422, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.809192657470703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05807905271649361, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.82780647277832, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.019882671535015106, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 147.3437042236328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.054616887122392654} -{"step": 335544320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 89.37042999267578, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18571698665618896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.143674850463867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.008404730819165707, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.61981725692749, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03820309415459633, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.706546783447266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04577510431408882, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.72298002243042, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2643967270851135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.7042436599731445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.26484015583992004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.0013427734375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.008039049804210663, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.085330963134766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1646706461906433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.985941886901855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14334464073181152, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.956422805786133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004724917002022266, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.847090721130371, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021288853138685226, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.812254428863525, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.025231197476387024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.435060024261475, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15742512047290802, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.475981712341309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11978232115507126, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.918900489807129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.007970517501235008, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.034469604492188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.17999860644340515, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.853618621826172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11869597434997559, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.960939407348633, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02689162828028202, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 151.37074279785156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.10738975554704666} -{"step": 356515840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 90.18193054199219, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1868646740913391, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.151386260986328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006104396656155586, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.674643039703369, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.048540469259023666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.754110813140869, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.061270155012607574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.724301815032959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15949419140815735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.7067437171936035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17312105000019073, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.998411178588867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0035280922893434763, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.116508483886719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10710175335407257, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.000835418701172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08952316641807556, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.960131645202637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0031623966060578823, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.89450740814209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01689925044775009, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.852238178253174, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.024122020229697227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.439153671264648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10176601260900497, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.480656147003174, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07919174432754517, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.926070213317871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003904904704540968, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.101515769958496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10428297519683838, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.887127876281738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06279106438159943, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.09077262878418, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.022702757269144058, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 155.23870849609375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07147815823554993} -{"step": 377487360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 90.95699310302734, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2212103307247162, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.15605354309082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007331445813179016, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.726557731628418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04164022207260132, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.798417568206787, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.053783904761075974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.72062349319458, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2253408581018448, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.704331398010254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23163330554962158, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.994836807250977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0050890338607132435, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.143105506896973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14849095046520233, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.012619018554688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1257672905921936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.966256141662598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004770119674503803, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.952005863189697, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.032049957662820816, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.899295806884766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.051186174154281616, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4425272941589355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14116579294204712, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.485681533813477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08981287479400635, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.937154769897461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005214070435613394, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.172094345092773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12855303287506104, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.924602508544922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08389309793710709, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.223482131958008, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.016263682395219803, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 158.96397399902344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.10007724910974503} -{"step": 398458880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 91.6969985961914, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.20225776731967926, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.163387298583984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00781931821256876, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.78471040725708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04324169456958771, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.848275661468506, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05311664193868637, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.718277931213379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24191386997699738, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.702600002288818, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2488475739955902, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.99167537689209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005684761330485344, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.169295310974121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15688541531562805, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.02481746673584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.129573792219162, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.97095012664795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004289055708795786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.004090785980225, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.020680347457528114, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.940299987792969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02837490104138851, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.444690704345703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14113540947437286, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.488828659057617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09520675987005234, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.94697093963623, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0033940779976546764, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.23332405090332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10022522509098053, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.95811939239502, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06744663417339325, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.35507583618164, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.016552511602640152, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 162.53488159179688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05624901130795479} -{"step": 419430400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 92.3997573852539, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16825619339942932, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.168781280517578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006719739641994238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.836230754852295, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.036598291248083115, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.889954090118408, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04591874033212662, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.715531349182129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19733740389347076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.700194835662842, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19533468782901764, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.98763370513916, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003975323401391506, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.190682411193848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12504205107688904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.03432846069336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10599973797798157, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.976816177368164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003285046899691224, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.0607428550720215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.020563578233122826, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.984251976013184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03046334534883499, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4473557472229, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1155785471200943, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.491968154907227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0797729566693306, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.95754623413086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003934280946850777, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.292252540588379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09877844154834747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.989632606506348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06968385726213455, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.485919952392578, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.016047541052103043, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 165.93624877929688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06425681710243225} -{"step": 440401920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 93.06876373291016, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16101649403572083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.17323875427246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005690295249223709, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.8876051902771, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03773806244134903, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.931217670440674, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.041995882987976074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.710573673248291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18496981263160706, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.696066856384277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20168524980545044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.984602928161621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004414682742208242, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.211355209350586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13035885989665985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.043425559997559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11019724607467651, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.983698844909668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004545003641396761, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.118174076080322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.022996097803115845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.0289306640625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03314101696014404, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.450057029724121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1542944759130478, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.4955644607543945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09563581645488739, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.968123435974121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004442555829882622, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.350325584411621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12253982573747635, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.0202054977417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08658337593078613, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.614974975585938, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.020074041560292244, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 169.1759033203125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06428783386945724} -{"step": 461373440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 93.70811462402344, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1794285923242569, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.17734718322754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0071310377679765224, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.937227725982666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04897375404834747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.970517158508301, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.060623858124017715, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.704633712768555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19960281252861023, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.69101619720459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20415504276752472, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.981446266174316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004701381549239159, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.2301664352417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14131483435630798, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.05202579498291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11529140174388885, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.990428924560547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0034155852627009153, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.1749138832092285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02296951599419117, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.072824478149414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03465968742966652, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.451198101043701, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14055229723453522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.49703311920166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0879296213388443, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.978401184082031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004313148092478514, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.403863906860352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11268224567174911, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.049324035644531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07198936492204666, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.742774963378906, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.015268527902662754, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 172.25390625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05970705300569534} -{"step": 482344960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 94.3187255859375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.21169902384281158, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.181716918945312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006180800963193178, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.988091945648193, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.05504817143082619, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.011715412139893, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.06445149332284927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6992034912109375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18469659984111786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.685945510864258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1906316727399826, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.977374076843262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0039035018999129534, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.246688842773438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12356648594141006, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.05979061126709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10199974477291107, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.999103546142578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038018152117729187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.239927768707275, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02624892257153988, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.121393203735352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.041276805102825165, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.453379154205322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.13942137360572815, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5005269050598145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08098916709423065, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.987762451171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0033703383523970842, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.453649520874023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0983172208070755, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.075858116149902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06883500516414642, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.868789672851562, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012616008520126343, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 175.19883728027344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05638314411044121} -{"step": 503316480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 94.90225219726562, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.20168597996234894, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.185617446899414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007472706958651543, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.036789894104004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0399039164185524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.051102161407471, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04867768660187721, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.693480491638184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23304533958435059, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.680662155151367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24252746999263763, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.972967147827148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005349661223590374, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.261443138122559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15729621052742004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.066620826721191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13165083527565002, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.005956649780273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0045263259671628475, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.300317287445068, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.026664594188332558, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.165949821472168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04383569583296776, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.453362464904785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16962341964244843, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.501311302185059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10213956236839294, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.997475624084473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005349584389477968, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.500913619995117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1588604599237442, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.102021217346191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11202198266983032, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.994653701782227, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01583031937479973, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 177.99876403808594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09951364248991013} -{"step": 524288000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 95.4607162475586, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.173790842294693, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.189699172973633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006461607292294502, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.081820487976074, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03183655068278313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.086215972900391, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.044682763516902924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.688521862030029, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2197323590517044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.676581859588623, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2192879170179367, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.969632148742676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00468054972589016, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.276426315307617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1481378674507141, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.07353687286377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12318132817745209, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.01369285583496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00422373553737998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.358630657196045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02568681910634041, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.208092212677002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0382656455039978, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.454948902130127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.149294912815094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.503614902496338, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09733951836824417, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.003236770629883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004746751394122839, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.539839744567871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.126820370554924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.120954513549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08479353785514832, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.11484146118164, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011526040732860565, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 180.6418914794922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.061778075993061066} -{"step": 545259520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 95.99492645263672, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13680732250213623, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.192962646484375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005379419308155775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.129014492034912, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02403535321354866, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.123932838439941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.028990743681788445, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.681814670562744, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19343781471252441, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.670223712921143, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.198636993765831, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.964644432067871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0040532061830163, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.287606239318848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12838520109653473, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078822135925293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1081036776304245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.023624420166016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004271000158041716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.426203727722168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.03023330681025982, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.256256103515625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.045903850346803665, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.456074237823486, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1521119475364685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.505194664001465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08885842561721802, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.010976791381836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003902504686266184, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.580229759216309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10884825140237808, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.141508102416992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07565822452306747, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.235614776611328, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012253240682184696, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 183.1404571533203, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.053283922374248505} -{"step": 566231040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 96.50667572021484, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.23378127813339233, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.196094512939453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.009351613000035286, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.174658298492432, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04569125175476074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.159543514251709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0643823891878128, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6755218505859375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.3140159845352173, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.664389133453369, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.31111767888069153, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.960845947265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007765975780785084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300117492675781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.20980793237686157, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.084969520568848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.16569465398788452, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.033172607421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0059326994232833385, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.49534797668457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.032422468066215515, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.3056793212890625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05202467367053032, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.45633602142334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2144993543624878, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.506428241729736, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12484081089496613, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.017059326171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005022590979933739, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.615368843078613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13315874338150024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.160242080688477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09044206887483597, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.355682373046875, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.013279551640152931, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 185.52310180664062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08145350217819214} -{"step": 587202560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 96.9976577758789, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.19704098999500275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.19970703125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006702489219605923, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.218095779418945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03847731277346611, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.193765163421631, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04554888233542442, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6703104972839355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22235144674777985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.6589674949646, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2378266304731369, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.956890106201172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005131088197231293, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.311867713928223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1620876044034958, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.089766502380371, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.132193461060524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.042497634887695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00576296029612422, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.562889099121094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.04781070351600647, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.353408336639404, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.07024078071117401, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.456641674041748, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19096830487251282, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.506920337677002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11110451817512512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02153778076172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004734706599265337, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.646529197692871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13663055002689362, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.176133155822754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09546761959791183, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.47359848022461, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009165543131530285, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 187.78909301757812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05805690586566925} -{"step": 608174080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 97.46521759033203, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.23000700771808624, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.200719833374023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006650399416685104, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.256455421447754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.05184350162744522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.22279167175293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.06993437558412552, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.663182258605957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22594201564788818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.651796340942383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2354612499475479, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.951196670532227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00608876021578908, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.320408821105957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1567189246416092, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.093465805053711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12870967388153076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.052560806274414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005376426503062248, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.632411003112793, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.029952114447951317, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.4015679359436035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04771397262811661, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.456742286682129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18429037928581238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.50820255279541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10269364714622498, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.0272159576416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005556750576943159, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.6781587600708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1560232788324356, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.194511413574219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11243624985218048, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.59172248840332, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.016204146668314934, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 189.93479919433594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06911925226449966} -{"step": 629145600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 97.91213989257812, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1877346783876419, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.202484130859375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007119736634194851, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.2938551902771, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.035650089383125305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.2523369789123535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04616270214319229, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.656511306762695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23632484674453735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.645331382751465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21928878128528595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.947052955627441, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005027793813496828, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.32983112335205, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14820709824562073, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.098029136657715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11929341405630112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.064077377319336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004373842850327492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.70487117767334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02102607861161232, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.450196266174316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030719123780727386, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4578447341918945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14709030091762543, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.511240005493164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0925687924027443, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03050422668457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003780415980145335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.705229759216309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10846652090549469, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.208090782165527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07270412147045135, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.705883026123047, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012091008946299553, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 191.96560668945312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.048002708703279495} -{"step": 650117120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 98.33966064453125, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1857294738292694, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.203815460205078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005344967823475599, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.329800605773926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.035686641931533813, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.280463218688965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04633520543575287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.649130344390869, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1979013979434967, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.637845039367676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19949179887771606, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.942092895507812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003956171218305826, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.337104797363281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13692490756511688, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.101163864135742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10911709815263748, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.077116012573242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004285199102014303, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.783267974853516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02875271439552307, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.50286340713501, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04584014415740967, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.458562850952148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14482267200946808, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.513274192810059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09168720990419388, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03411102294922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0038213063962757587, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.730607032775879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11479402333498001, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.221600532531738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07988661527633667, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.818281173706055, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010998686775565147, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 193.88584899902344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05688533931970596} -{"step": 671088640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 98.74923706054688, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16218948364257812, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.20427894592285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0068692476488649845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.366020679473877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.033498745411634445, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.308741092681885, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.039547182619571686, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6412763595581055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23199741542339325, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.629801273345947, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22528484463691711, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.936982154846191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004499128554016352, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.344210624694824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14278548955917358, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.104670524597168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1114334836602211, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.091079711914062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0036029431503266096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.863310813903809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021330127492547035, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.556469917297363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03674659878015518, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.460000991821289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14309996366500854, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5161967277526855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0894298106431961, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03721046447754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00434810109436512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.754014015197754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11731911450624466, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.233867645263672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08280222117900848, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.929668426513672, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.013157490640878677, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 195.70504760742188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05082424730062485} -{"step": 692060160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.14143371582031, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1697525978088379, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.203472137451172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006656658835709095, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.3969035148620605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03752152994275093, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.3324294090271, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.044837675988674164, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.632474422454834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21958225965499878, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.620667457580566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21898137032985687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.932332038879395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004386397544294596, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.351079940795898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15190647542476654, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.10773754119873, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11782748252153397, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.105735778808594, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004547025542706251, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.946802616119385, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018180644139647484, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.611871719360352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02950242906808853, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.460139274597168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16226480901241302, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.517846584320068, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09543152898550034, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.039621353149414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005448665004223585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.774356842041016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.15050867199897766, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.245049476623535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09957064688205719, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.0396671295166, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.015251975506544113, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 197.43446350097656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06484118849039078} -{"step": 713031680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.51653289794922, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15328705310821533, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.202756881713867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006157617550343275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.4262847900390625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03355354815721512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.35516357421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04911372438073158, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.624655246734619, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21236224472522736, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.612453460693359, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21579360961914062, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.927457809448242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004767550155520439, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.357067108154297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15446707606315613, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.110474586486816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12346304208040237, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.12030029296875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004350739996880293, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.02753210067749, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.023869099095463753, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.665279865264893, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0377645306289196, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.460972309112549, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14792676270008087, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.520140647888184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09828280657529831, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.041025161743164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004291802644729614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.79238224029541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12062748521566391, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.253875732421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07870635390281677, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.144540786743164, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011557107791304588, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.07469177246094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05021332949399948} -{"step": 734003200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.87357330322266, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.17812049388885498, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.201627731323242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005807207431644201, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.455819129943848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03801953047513962, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.378324508666992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.044450197368860245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.616361618041992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2056865096092224, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.6037492752075195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20680706202983856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.922426223754883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004315161146223545, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.361764907836914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1455824077129364, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.112001419067383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11468826979398727, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.135826110839844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004174543544650078, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.111320495605469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.033818162977695465, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.720723628997803, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05267367139458656, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.46146297454834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14866898953914642, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.522271633148193, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09485161304473877, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.042627334594727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00367988389916718, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.809333801269531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.109723299741745, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.26322078704834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07214868813753128, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.249467849731445, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011573177762329578, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 200.633544921875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04708780348300934} -{"step": 754974720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.2135238647461, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.19538576900959015, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.200401306152344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007158160675317049, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.486011981964111, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04070689529180527, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.4025654792785645, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05054287612438202, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6077094078063965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.245735764503479, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.594586372375488, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2674366235733032, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.917159080505371, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.006415775511413813, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.365885734558105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.18919974565505981, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.113799095153809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14924563467502594, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.151416778564453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006528491619974375, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.195105075836182, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.028338506817817688, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.776177883148193, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.052007224410772324, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.461961269378662, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.22109131515026093, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.52431058883667, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1255531758069992, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.043533325195312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.008186180144548416, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.824210166931152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.21653388440608978, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.270929336547852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1540701687335968, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.35308074951172, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007346997503191233, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 202.11326599121094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08226938545703888} -{"step": 775946240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.53853607177734, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.144850954413414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.197872161865234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0057731932029128075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.511653900146484, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02707335166633129, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.421937942504883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03787268325686455, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.599267482757568, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20508301258087158, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.585642337799072, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21181052923202515, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.912480354309082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004238928202539682, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.369729042053223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14568853378295898, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.115158081054688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11524937301874161, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.167715072631836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004174579866230488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.280251979827881, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019060852006077766, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.832076072692871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03194175288081169, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.462797164916992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1615314781665802, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.526727676391602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09309394657611847, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.04373550415039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004523435607552528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.836649894714355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11848334223031998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.27730655670166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08250332623720169, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.452604293823242, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01040741428732872, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 203.51556396484375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04736665263772011} -{"step": 796917760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.84974670410156, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15987160801887512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.196134567260742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006110765039920807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.538494110107422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03204868733882904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.442637920379639, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04139488935470581, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.59025764465332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21023522317409515, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.575987815856934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21718734502792358, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.906937599182129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004418785683810711, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.371858596801758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1545242965221405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.115864753723145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11957500129938126, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.184797286987305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004360560793429613, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.370089530944824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.025546260178089142, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.89139461517334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0408049002289772, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.463657855987549, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15638785064220428, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.528619766235352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10145933926105499, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.044607162475586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004564089234918356, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.850147247314453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12920714914798737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.284090042114258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09272819012403488, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.551836013793945, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008810059167444706, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 204.85951232910156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06477934867143631} -{"step": 817889280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.14578247070312, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.20748692750930786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.193164825439453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007225861307233572, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.563281536102295, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.036729563027620316, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.461469650268555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04886237531900406, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.581043243408203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.27398499846458435, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.566330909729004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2818998396396637, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.901969909667969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005972034297883511, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.375466346740723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.19696171581745148, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.117362022399902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.15048620104789734, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.20224952697754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006323020439594984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.459164142608643, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.026077941060066223, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.950024127960205, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04567622020840645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.464397430419922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.22344158589839935, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.530705451965332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11702536791563034, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.044790267944336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004829795565456152, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.861865043640137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.14145652949810028, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.289941787719727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10114939510822296, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.648263931274414, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.013042164035141468, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 206.13720703125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07256011664867401} -{"step": 838860800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.42642211914062, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1797904521226883, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.189807891845703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006926069036126137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.5834856033325195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04278109595179558, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.476991176605225, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05278534069657326, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.57284688949585, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23115511238574982, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.557408332824707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2387111932039261, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.896489143371582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004897730890661478, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.37728500366211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16851839423179626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.118020057678223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1314166933298111, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.219648361206055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005651235580444336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.546398162841797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.031406719237565994, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.006115913391113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05343165993690491, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.465780258178711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19817319512367249, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.533334255218506, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11363182216882706, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.044313430786133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004159240983426571, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.871569633483887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1294303983449936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294076919555664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08826544135808945, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.740455627441406, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009238067083060741, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 207.34634399414062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05352908372879028} -{"step": 859832320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.6934814453125, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.19547200202941895, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.18631935119629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0063984813168644905, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.6052350997924805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04210828244686127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.493181228637695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.049185674637556076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.563807964324951, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24700024724006653, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.547609329223633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.26667433977127075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.891427040100098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007100434973835945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.37916374206543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1870681196451187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.11878490447998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14838212728500366, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.236921310424805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00679667666554451, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.63273286819458, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.032986924052238464, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.061482906341553, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0655113235116005, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.466602325439453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.23418666422367096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.535277366638184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11491476744413376, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.042951583862305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.006458135321736336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.878159523010254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.17820964753627777, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.298015594482422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.12402451783418655, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.833019256591797, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01342862006276846, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 208.4987030029297, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08613552898168564} -{"step": 880803840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.94660949707031, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15427939593791962, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.18147850036621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005688059609383345, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.62285852432251, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03255264833569527, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.506218910217285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03633489832282066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.554816722869873, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21213600039482117, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.537661552429199, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22230809926986694, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.886260986328125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004836809355765581, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.38060474395752, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15521806478500366, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.119197845458984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12282989919185638, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.2548828125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0046623749658465385, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.72130823135376, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.024432092905044556, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.118424892425537, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.041563667356967926, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.467344760894775, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1657496988773346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.537866115570068, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09936700761318207, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.042123794555664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004947331268340349, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.88610553741455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12552934885025024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.301860809326172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09627386927604675, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.923263549804688, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008289842866361141, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 209.59951782226562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04515163227915764} -{"step": 901775360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.18592071533203, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16381607949733734, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.175661087036133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006330007221549749, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.638778209686279, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.044374994933605194, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5177412033081055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0541272833943367, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.545689105987549, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2200678437948227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.527730464935303, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24133236706256866, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.880728721618652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0050013246946036816, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.38066577911377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1795811653137207, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.118865966796875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14239515364170074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.273469924926758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005588987842202187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.810616493225098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.030622320249676704, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.175553798675537, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.053170476108789444, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.467921733856201, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.20546723902225494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.540738582611084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11704584211111069, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.040864944458008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.006939925253391266, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.892014503479004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.18837907910346985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.305188179016113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.13440310955047607, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.012432098388672, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012835093773901463, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 210.6397705078125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06251958012580872} -{"step": 922746880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.41421508789062, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.17331846058368683, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.170997619628906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005874899215996265, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.65885591506958, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03366272523999214, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.532191276550293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0409543402493, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.53681755065918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2194737195968628, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.518121719360352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23309795558452606, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.8751859664917, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00453218212351203, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.380939483642578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1650215983390808, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.118496894836426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12143053859472275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.292152404785156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004373306408524513, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.9001665115356445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.023508355021476746, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.232931137084961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03883859142661095, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4684319496154785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17747321724891663, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5428924560546875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09738397598266602, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03941535949707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004093871917575598, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.89767837524414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1172710582613945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.307811737060547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08505931496620178, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.099214553833008, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007570852525532246, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 211.63941955566406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04027943313121796} -{"step": 943718400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.63068389892578, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.19332842528820038, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.16642189025879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007029644213616848, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.6767802238464355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.051006752997636795, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.545461177825928, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05767703428864479, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.527604103088379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.25597241520881653, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.508379936218262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.28750523924827576, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.870383262634277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0069329263642430305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.381551742553711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.22030940651893616, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.118501663208008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.16824971139431, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.309865951538086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006738330237567425, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.986880779266357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.03455904871225357, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.288208484649658, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.06802672892808914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.468595504760742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2709118723869324, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.544408321380615, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1369522660970688, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.037506103515625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.008555705659091473, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.901494026184082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.22616758942604065, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.309152603149414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.15548495948314667, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.182476043701172, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.014171818271279335, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 212.583251953125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1069059818983078} -{"step": 964689920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.83692932128906, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15864884853363037, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.161624908447266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005082873161882162, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.692153453826904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.042254552245140076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.55643892288208, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0513150691986084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.519028663635254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19304007291793823, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.4989519119262695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1997712403535843, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.865564346313477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0038684988394379616, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.381552696228027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13281206786632538, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.118124961853027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10339333862066269, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.326730728149414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038103272672742605, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.072552680969238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01684216968715191, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.342890739440918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030429771170020103, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.468432903289795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1301405429840088, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.54496431350708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08455494046211243, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.035358428955078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003076777793467045, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.904984474182129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09311511367559433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.31006908416748, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07218984514474869, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.26221466064453, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008568566292524338, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 213.4856414794922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03786230832338333} -{"step": 985661440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.0288314819336, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13457661867141724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.15479278564453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004987509921193123, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.705075740814209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03254183009266853, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.565519332885742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04020589962601662, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.5091423988342285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18583644926548004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.488162517547607, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20179277658462524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.859256744384766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0044565992429852486, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.378739356994629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15202198922634125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.116286277770996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12032986432313919, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.346267700195312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0045106252655386925, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.165044784545898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017412614077329636, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.4019904136657715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03276439383625984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.469353675842285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16681380569934845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.547357082366943, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0996970534324646, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03434181213379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005002281162887812, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.91014289855957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13999798893928528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.312479972839355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.12078322470188141, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.342222213745117, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006934290751814842, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 214.34400939941406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05012379214167595} -{"step": 1006632960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.20957946777344, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1490229368209839, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.147598266601562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00597240449860692, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.716988563537598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02921243943274021, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.574188232421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.034072231501340866, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.4997382164001465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23419208824634552, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.477906227111816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.25570976734161377, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.854101181030273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005514075513929129, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.37734317779541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1962435394525528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.115495681762695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13496555387973785, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.3657169342041, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005096361972391605, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.2533540725708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018963254988193512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.457682132720947, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.035734884440898895, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.469965934753418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1906980723142624, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.549586296081543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10302436351776123, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.032556533813477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003473007818683982, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.912769317626953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1198413223028183, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.313557624816895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07839397341012955, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.419357299804688, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009197474457323551, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 215.1563262939453, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04414010047912598} -{"step": 1027604480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.37959289550781, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12594154477119446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.14113998413086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0042606256902217865, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.728307723999023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03308701142668724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.582416534423828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.038825616240501404, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.490254878997803, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16582883894443512, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.467419624328613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1762402355670929, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.849410057067871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00350739574059844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.376051902770996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13130412995815277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.114632606506348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10103248804807663, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.38570785522461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004124090541154146, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.344433784484863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018137233331799507, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.514616012573242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03249914571642876, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.470733642578125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1351565718650818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.551869869232178, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08347727358341217, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.029827117919922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003377941669896245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.913269996643066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09027375280857086, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.313626289367676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06531857699155807, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.494762420654297, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008954312652349472, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 215.93356323242188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03761247172951698} -{"step": 1048576000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.540283203125, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1554422378540039, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.133525848388672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005599639378488064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.736498832702637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03872879222035408, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.58811092376709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04612375795841217, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.4805073738098145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22648663818836212, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.457031726837158, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24748684465885162, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.84533977508545, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005547760985791683, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.37535572052002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.18414896726608276, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.114052772521973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1376493275165558, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.40499496459961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005340976174920797, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.431375503540039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.025825342163443565, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.56900691986084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05353619530797005, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.471002101898193, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1952422708272934, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.553330898284912, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10081177204847336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.027183532714844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004527051467448473, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.913535118103027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12835077941417694, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.313820838928223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0904165655374527, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.568185806274414, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010303976014256477, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 216.67718505859375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06009377911686897} -{"step": 1069547520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.69100189208984, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1356734186410904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.126863479614258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005613479297608137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.747913837432861, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.029310276731848717, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5959906578063965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03213926777243614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.471582412719727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20073801279067993, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.447401523590088, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22079533338546753, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.840250015258789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004230023827403784, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.372828483581543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1582774817943573, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.112629890441895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12311100959777832, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.42438507080078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004966586362570524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.518303871154785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019256580621004105, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.6232194900512695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03421541303396225, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.471554756164551, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16211962699890137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5555925369262695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0945245698094368, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02454948425293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004538564942777157, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.913749694824219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12086865305900574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.313860893249512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09954286366701126, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.639610290527344, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008800318464636803, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 217.38540649414062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05331788212060928} -{"step": 1090519040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.83125305175781, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1704653799533844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.11827850341797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006227764766663313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.755045413970947, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03947070613503456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.600195407867432, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.044581662863492966, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.462284564971924, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24254752695560455, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.437187194824219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2479977309703827, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.835807800292969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00499895540997386, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.371243476867676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1775313913822174, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.111761093139648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13727889955043793, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.443817138671875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005538079887628555, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.605257034301758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02479366399347782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.677193641662598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05039631202816963, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.472074508666992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.20519307255744934, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.557654857635498, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10418447852134705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.022167205810547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004916106816381216, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.914484024047852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.14716501533985138, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.313920021057129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11163799464702606, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.708913803100586, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007533935829997063, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.05715942382812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06425929814577103} -{"step": 1111490560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.9634780883789, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14886105060577393, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.110403060913086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005304038990288973, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.761297225952148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03431788459420204, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.603919506072998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04061499983072281, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.4530205726623535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20987193286418915, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.426817893981934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2240525484085083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.830805778503418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004843658767640591, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.368330001831055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1617969572544098, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.110124588012695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12292498350143433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.463483810424805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004690357018262148, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.692914009094238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01989036798477173, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.731282711029053, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.038038380444049835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.472564697265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18987703323364258, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.559544563293457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09913789480924606, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02010154724121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0038231804501265287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.914925575256348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11640554666519165, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.313960075378418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0827770084142685, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.775869369506836, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008403286337852478, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.69900512695312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04715766757726669} -{"step": 1132462080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.08735656738281, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15879559516906738, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.101619720458984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005831586662679911, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.766286849975586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02832208015024662, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.606490612030029, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03602365404367447, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.443396091461182, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2381865531206131, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.416596412658691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2677778899669647, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.826689720153809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005498591810464859, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.366458892822266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.19254378974437714, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.109217643737793, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1503160148859024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.483736038208008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0062768785282969475, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.781135559082031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02540280856192112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.785369873046875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05200028046965599, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.473453044891357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2339506447315216, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5616865158081055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.11127924174070358, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.017230987548828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005591806955635548, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.913752555847168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.15673360228538513, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.312861442565918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.12113098800182343, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.84083366394043, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006888995878398418, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.31289672851562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0654134750366211} -{"step": 1153433600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.201904296875, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12712831795215607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.094104766845703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004738877527415752, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7753071784973145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.031123086810112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.612671852111816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03754653409123421, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.434205055236816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18674027919769287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.406477928161621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21427540481090546, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.822100639343262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004419047851115465, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.363740921020508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16885243356227875, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.107759475708008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12637266516685486, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.504697799682617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004827891476452351, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.871052742004395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0173957459628582, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.8409953117370605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.034337759017944336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.474287509918213, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17695379257202148, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.563831329345703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0970342755317688, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.0146484375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004513082560151815, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.912551879882812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1315322369337082, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.311909675598145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0997258797287941, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.90411949157715, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007418267894536257, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.90103149414062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04590211436152458} -{"step": 1174405120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.30814361572266, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14709466695785522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.085506439208984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005755849182605743, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.780262470245361, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03406824916601181, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.615466594696045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.045853715389966965, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.424717426300049, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21159310638904572, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.396132469177246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24141861498355865, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.817397117614746, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.006159309763461351, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.360695838928223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.185804083943367, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.1058931350708, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13654686510562897, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.525409698486328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0056136068888008595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.958732604980469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02462008409202099, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.894466400146484, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04830045998096466, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.474959373474121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1974792778491974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.565835952758789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10884343832731247, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.012109756469727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0037938158493489027, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.910967826843262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11997632682323456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.310986518859863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09125994145870209, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.965755462646484, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009624316357076168, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.45703125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.051240768283605576} -{"step": 1195376640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.40540313720703, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12269214540719986, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.076370239257812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004410247318446636, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.784846782684326, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.039681706577539444, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.617709159851074, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04406728968024254, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.415170192718506, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16237682104110718, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.385890483856201, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18617530167102814, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.81285285949707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0037367818877100945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.357145309448242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14149242639541626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.104029655456543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10739397257566452, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.546794891357422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00442234193906188, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.047123908996582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.015434657223522663, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.948825359344482, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02891438826918602, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.475841045379639, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12800300121307373, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.568531513214111, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08619596064090729, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.009780883789062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002973484108224511, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.9099760055542, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09004441648721695, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.31047248840332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06959253549575806, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.02491569519043, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00841795839369297, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.98483276367188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03541294485330582} -{"step": 1216348160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.49604797363281, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1475520133972168, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.066787719726562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00515027204528451, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.786255836486816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03730985149741173, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.617584705352783, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04636698588728905, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.405559062957764, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20341260731220245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.375349044799805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2318374216556549, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.808334350585938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0050026788376271725, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.353453636169434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17386169731616974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.10250473022461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13203559815883636, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.568359375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005760507192462683, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.136195182800293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.023364536464214325, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.002978324890137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04789012297987938, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4769110679626465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.20152737200260162, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.570974826812744, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10139987617731094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.007516860961914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004847709555178881, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.908832550048828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13576212525367737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.30996036529541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10288587957620621, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.08207130432129, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009975037537515163, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.48703002929688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.058504290878772736} -{"step": 1237319680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.5809555053711, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13360048830509186, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.057687759399414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0052274116314947605, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.786743640899658, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.035479526966810226, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6168622970581055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04205554723739624, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.396687984466553, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1903104931116104, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.365579605102539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22645515203475952, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.804343223571777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004817938897758722, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.350296974182129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1719188541173935, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.100966453552246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13245615363121033, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.589025497436523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004805389791727066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.221299171447754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016991520300507545, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.05471420288086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03659108281135559, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4778337478637695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18655787408351898, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.573193073272705, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09894594550132751, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.004514694213867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004623017273843288, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.906132698059082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12809917330741882, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.30837631225586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08456923812627792, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.136507034301758, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009475337341427803, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.96705627441406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05216225981712341} -{"step": 1258291200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.65850067138672, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1272680014371872, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.048320770263672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0046692113392055035, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.788728713989258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.028441142290830612, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.617393493652344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03424526005983353, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.387336254119873, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19310450553894043, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.355566024780273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2204200178384781, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.800963401794434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004134305752813816, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.34769058227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1721988171339035, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.099624633789062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12788085639476776, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.60940933227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004801348317414522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.304436683654785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019767746329307556, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.105201721191406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04314156621694565, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4782514572143555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1826333850622177, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.574690818786621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09904628247022629, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.002071380615234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004738152027130127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.90420150756836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13289879262447357, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.307347297668457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10430702567100525, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.189558029174805, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009290209040045738, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.42138671875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0535896010696888} -{"step": 1279262720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.73014831542969, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11494345963001251, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.038358688354492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004681846592575312, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.787229537963867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.028928715735673904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.614933490753174, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03191982954740524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.378345966339111, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17164060473442078, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.345926284790039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20511020720005035, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.797246932983398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0037874123081564903, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.344351768493652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1512414962053299, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.097900390625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11848525702953339, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.629438400268555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004491811152547598, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.384347915649414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014754057861864567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.153512001037598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.028444180265069008, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.479433536529541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16012953221797943, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5772833824157715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09115341305732727, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.998897552490234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002835627645254135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.901119232177734, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09456072747707367, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.305865287780762, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06932364404201508, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.240562438964844, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007826875895261765, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.85330200195312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03421010822057724} -{"step": 1300234240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.79618072509766, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14263926446437836, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.02923011779785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004941675346344709, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7891082763671875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03709673136472702, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.615187168121338, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04777545481920242, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.369775295257568, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17706631124019623, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.336864948272705, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2042408138513565, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.79348087310791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004447035491466522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.340932846069336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16604676842689514, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.096111297607422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12284209579229355, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.64927101135254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00422776211053133, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.462994575500488, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.023098204284906387, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.201200485229492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04440094903111458, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.480315685272217, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15768204629421234, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.579464912414551, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09896335005760193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.996453285217285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0042061880230903625, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.898725509643555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11527831852436066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.304369926452637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08883985131978989, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.289342880249023, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006688675377517939, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 223.26585388183594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0452415756881237} -{"step": 1321205760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.8567123413086, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11857014894485474, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.0198917388916, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004436730407178402, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.788577556610107, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.029121126979589462, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.613683700561523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03271955996751785, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.361091136932373, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.164690762758255, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.327786445617676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2024451196193695, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.790315628051758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004043704364448786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.33820915222168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15763472020626068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.09485149383545, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12189678102731705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.66850471496582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004392622038722038, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.539444923400879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01755296252667904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.247448921203613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03880719095468521, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.480771541595459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15628860890865326, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.581092834472656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08603177964687347, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.994085311889648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003462007036432624, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.896058082580566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10188191384077072, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.30275821685791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07695911079645157, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.335765838623047, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008624610491096973, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 223.6548614501953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.046026069670915604} -{"step": 1342177280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.91075897216797, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12170950323343277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.010746002197266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004431409761309624, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.786645412445068, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03313353285193443, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6111836433410645, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04069327190518379, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.352966785430908, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.176702618598938, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.31931734085083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20033012330532074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.786845207214355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004351912532001734, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.334834098815918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16272664070129395, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.09317398071289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1252373605966568, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.687129974365234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004821686539798975, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.613900184631348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01613456942141056, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.292176246643066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0319434218108654, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.481362819671631, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15180498361587524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.582886219024658, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09238935261964798, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.991327285766602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004733817186206579, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.89282512664795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1206054762005806, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.301121711730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1011824905872345, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.380338668823242, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009671713225543499, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.02049255371094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03705446422100067} -{"step": 1363148800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.96006774902344, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15101604163646698, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.00162696838379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004555465187877417, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.784909725189209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.044991254806518555, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6089653968811035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05969297140836716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.34491491317749, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18910515308380127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.310740947723389, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23826801776885986, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.783642768859863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004658289719372988, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.331559181213379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1823592185974121, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.091464042663574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13891425728797913, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.705307006835938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005336349364370108, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.685985565185547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.020020414143800735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.335731506347656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.047100476920604706, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.482053279876709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17670777440071106, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.584695816040039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09754883497953415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.989062309265137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0039369105361402035, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.890276908874512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11927562952041626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.299851417541504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0924556776881218, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.423070907592773, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006941912695765495, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.3703155517578, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04893224313855171} -{"step": 1384120320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.00439453125, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11454953253269196, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.992609024047852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003914573695510626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7845587730407715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03438938409090042, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.607541084289551, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04308881238102913, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.336729526519775, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15445902943611145, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.302294731140137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1878441572189331, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.780850410461426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0038933141622692347, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.328875541687012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14343276619911194, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.089935302734375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10762016475200653, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.722806930541992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00394308241084218, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.754260063171387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014498825185000896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.377254486083984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.027879858389496803, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.48225212097168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15463045239448547, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.585851192474365, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08453239500522614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.987366676330566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002727166283875704, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.888596534729004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09225565195083618, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.299179077148438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06625384837388992, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.4639835357666, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00832317117601633, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.69915771484375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0348736010491848} -{"step": 1405091840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.04485321044922, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10776063054800034, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.984082221984863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0046220398508012295, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.782839298248291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02853499911725521, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.605199813842773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03395810350775719, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.328843116760254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17118795216083527, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.294166088104248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20704752206802368, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.778287887573242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004529116675257683, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.326297760009766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16955240070819855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.088642120361328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12500374019145966, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.739904403686523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0046305288560688496, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.8208589553833, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01559215597808361, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.41758918762207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03440426290035248, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4825825691223145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15861199796199799, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.587190628051758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09413189440965652, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.985316276550293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004461411386728287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.886176109313965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11659097671508789, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.298138618469238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09002812951803207, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.502378463745117, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007352745626121759, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.00657653808594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.043288704007864} -{"step": 1426063360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.08110046386719, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10275556147098541, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.976288795471191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003922951407730579, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.782670974731445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027309050783514977, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.604121208190918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03114037960767746, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.321544170379639, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15344946086406708, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.286844730377197, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18376995623111725, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.77593994140625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003987899050116539, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.323578834533691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15434525907039642, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.08726978302002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11678999662399292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.756553649902344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00418573385104537, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.884653091430664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014013128355145454, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.455832481384277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.025785217061638832, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.482873916625977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14764785766601562, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.58839750289917, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08470213413238525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.983375549316406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003539649536833167, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.883672714233398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10081454366445541, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.296927452087402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08205123245716095, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.538724899291992, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006568888667970896, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.29759216308594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03190699219703674} -{"step": 1447034880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.11325073242188, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10946369916200638, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.967955589294434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0039581842720508575, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.781191825866699, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027199583128094673, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.602139949798584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03299187868833542, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.314119815826416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1553051918745041, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.279312610626221, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1950763761997223, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.773536682128906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0046556610614061356, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.320898056030273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16620342433452606, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.085801124572754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12038001418113708, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.772998809814453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004473560489714146, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.946532249450684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01622859388589859, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.49337100982666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.034376055002212524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483179569244385, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14839503169059753, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.589543342590332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08354029804468155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.982089042663574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0030387011356651783, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.881844520568848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09304943680763245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.296289443969727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06938563287258148, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.573314666748047, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008636982180178165, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.56980895996094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03809318318963051} -{"step": 1468006400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.1415786743164, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11291490495204926, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.960134506225586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003792615607380867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.780763149261475, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02644355408847332, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.601221084594727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02990967221558094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.307137966156006, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16019825637340546, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.272159099578857, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20339180529117584, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.771221160888672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004676259122788906, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.318153381347656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1720096915960312, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.08446216583252, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12497718632221222, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.78948211669922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004368082620203495, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.00711441040039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.015461141243577003, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.53041934967041, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03486338257789612, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483688831329346, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15397238731384277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5908308029174805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08341789245605469, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.980478286743164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003463928820565343, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.879532814025879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0938824936747551, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.295171737670898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07261717319488525, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.606037139892578, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00879728700965643, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.8235321044922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03865448012948036} -{"step": 1488977920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.16716003417969, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09474637359380722, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.952977180480957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003313617315143347, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.780930042266846, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02625383995473385, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.600545883178711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03009386546909809, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.300400257110596, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1332627683877945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.265468597412109, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17032815515995026, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.76949691772461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0037846777122467756, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.316192626953125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14025945961475372, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.083727836608887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10542618483304977, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.804824829101562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0037386587355285883, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.06413459777832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013089272193610668, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.56533432006836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.026447761803865433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483724117279053, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11786185204982758, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.591488361358643, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0721823051571846, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.978838920593262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002163854194805026, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.877272605895996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07003485411405563, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294219970703125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05328155308961868, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.637001037597656, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007725835312157869, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.0631103515625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03116585873067379} -{"step": 1509949440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.1893081665039, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09616431593894958, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.945502281188965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003731260308995843, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.780364513397217, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025776799768209457, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5994110107421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02894810028374195, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2935895919799805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.14501148462295532, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.258706092834473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18271701037883759, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.767884254455566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0038722965400666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.314229965209961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1616472601890564, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.08296012878418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11489499360322952, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.819913864135742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004655543249100447, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.120105743408203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013627782464027405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.599587440490723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02758590318262577, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.48386812210083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.13055971264839172, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.592244625091553, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07647950202226639, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.977496147155762, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0025293338112533092, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.875557899475098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0748496949672699, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293652534484863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05402769148349762, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.666221618652344, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007985805161297321, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.28848266601562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.032623760402202606} -{"step": 1530920960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.208740234375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11885122209787369, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.937934875488281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003780374303460121, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.777398586273193, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.030572116374969482, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.596177101135254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03932669758796692, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2869977951049805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16167934238910675, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.2521748542785645, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19999396800994873, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.766268730163574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004101016093045473, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.312101364135742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16358983516693115, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.082008361816406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1224544420838356, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.83465576171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004700262565165758, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.17430591583252, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.015791254118084908, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.63293743133545, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0362454429268837, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484335422515869, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14840804040431976, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.593480587005615, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08537206053733826, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.976156234741211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002788258483633399, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.873605728149414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08814160525798798, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29284954071045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06855352222919464, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.693689346313477, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007437325548380613, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.49749755859375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03896424174308777} -{"step": 1551892480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.22602844238281, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09752948582172394, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.931206703186035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003461882472038269, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.776454925537109, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.029193907976150513, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.594911575317383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03332861140370369, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.280959129333496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1408063918352127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.246185302734375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1744159758090973, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.764959335327148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004033285658806562, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.31044864654541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15407268702983856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.081311225891113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11261903494596481, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.848499298095703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004203846212476492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.225205421447754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013569767586886883, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.664295196533203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02863919362425804, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484498977661133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11848782747983932, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.594238758087158, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07591791450977325, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.974848747253418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00241569965146482, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.871689796447754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07506651431322098, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.292160034179688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05651874840259552, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.719440460205078, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007492795120924711, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.69143676757812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03130144253373146} -{"step": 1572864000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.2407455444336, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10131721198558807, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.92454719543457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0032635615207254887, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.774651050567627, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02563616819679737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.592945098876953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.028658054769039154, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.275164604187012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13505348563194275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.240517616271973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17240755259990692, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.763278007507324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0037882826291024685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.3082914352417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1492205560207367, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.08044147491455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1117386445403099, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.86151695251465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004540945868939161, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.272866249084473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013445100747048855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.693672180175781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.029003243893384933, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484742641448975, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12020156532526016, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.595147609710693, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07408780604600906, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.973841667175293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002655717544257641, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.870183944702148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08133725076913834, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.291566848754883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06469264626502991, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.743501663208008, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008108548820018768, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.87269592285156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03453692048788071} -{"step": 1593835520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.25347900390625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09137298911809921, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.918785095214844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0032889083959162235, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.774447917938232, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027927150949835777, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.592182159423828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03208420053124428, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.269485950469971, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1366303265094757, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.234969615936279, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16805164515972137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.761789321899414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0036996847484260798, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.306349754333496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14369328320026398, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.079633712768555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10789044946432114, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.873836517333984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004618688952177763, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.317220687866211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012598443776369095, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.721356391906738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023842865601181984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485154151916504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12085511535406113, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.596216201782227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07368342578411102, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.972471237182617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0025277379900217056, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.868106842041016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08059880882501602, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.290847778320312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05877021700143814, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.76593780517578, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008037506602704525, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.03941345214844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0321200005710125} -{"step": 1614807040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.26421356201172, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08787043392658234, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.913439750671387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0031294364016503096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.774327754974365, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023558247834444046, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.591564655303955, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02605956606566906, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.264425277709961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.122857004404068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.230173587799072, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.15322376787662506, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.760953903198242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0033138394355773926, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.305314064025879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13100634515285492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.079320907592773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10059888660907745, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.884981155395508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003952172584831715, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.357884407043457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011999639682471752, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.746387481689453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023166241124272346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485085487365723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11090218275785446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.596572399139404, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06551910936832428, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.971607208251953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0025065818335860968, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.866762161254883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06763797998428345, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.290376663208008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.052783843129873276, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.78650665283203, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007893760688602924, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.1929931640625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.031105101108551025} -{"step": 1635778560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.27302551269531, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09059833735227585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.908577919006348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0032671631779521704, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.77403450012207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02527041547000408, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.590904712677002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02858254872262478, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.259827613830566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13302595913410187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.22571325302124, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17384406924247742, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.760133743286133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003793734358623624, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.304100036621094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15005412697792053, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.07873821258545, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11010083556175232, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.895597457885742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004253936465829611, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.396379470825195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012709819711744785, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.770458221435547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.027553707361221313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484922885894775, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1298942118883133, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.596819877624512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07113727927207947, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.970739364624023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0018608393147587776, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.86536979675293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07104645669460297, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.289876937866211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05277350917458534, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.805435180664062, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007084329146891832, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.33383178710938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02966778166592121} -{"step": 1656750080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.2807388305664, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0942593589425087, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.903703689575195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0034006070345640182, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.772775173187256, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026835232973098755, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.589418411254883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03427470102906227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.255485534667969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12467600405216217, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.221507549285889, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.15539175271987915, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.759206771850586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0038461347576230764, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.302740097045898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12804371118545532, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.078154563903809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09683544188737869, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.905473709106445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0039755795150995255, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.431756019592285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011915567331016064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.792543411254883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023111114278435707, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484803676605225, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09956011176109314, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.597021102905273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06554512679576874, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.970329284667969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0018669459968805313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.864579200744629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06729796528816223, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.289787292480469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05316311866044998, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.82280158996582, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007672174368053675, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.46136474609375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027451220899820328} -{"step": 1677721600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.2868881225586, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08582116663455963, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.898841857910156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003150941338390112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.770954132080078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02458132430911064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.587620258331299, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02702314220368862, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2511515617370605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12406113743782043, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.217303276062012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.164878249168396, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.758306503295898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003341387491673231, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.301370620727539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13342683017253876, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.077582359313965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1003832146525383, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.914691925048828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004116953816264868, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.463969230651855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011589547619223595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.81286334991455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023118598386645317, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485008716583252, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11205044388771057, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.597588062286377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06752871721982956, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.969786643981934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0022099700290709734, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.86364459991455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07275651395320892, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.289613723754883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05565140023827553, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.838764190673828, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006910502444952726, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.57754516601562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027738498523831367} -{"step": 1698693120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29204559326172, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08258400112390518, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.894960403442383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002568805357441306, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.770502090454102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02330668270587921, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.586980819702148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0238662026822567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2474517822265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11563961952924728, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.213745594024658, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1456437110900879, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757453918457031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0030286016408354044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.300082206726074, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12072642892599106, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.076980590820312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09368055313825607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.92308235168457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003402759786695242, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.493508338928223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01075887680053711, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.83148193359375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02051658369600773, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485095500946045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10139713436365128, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598071575164795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06405708938837051, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.969269752502441, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019608878064900637, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.862642288208008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06299268454313278, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.289419174194336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04949929937720299, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.85329818725586, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00729849748313427, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.68362426757812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027540089562535286} -{"step": 1719664640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29623413085938, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.082738496363163, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.891094207763672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0025276844389736652, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.770347595214844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023943115025758743, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.586646556854248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02605525217950344, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.243767261505127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.10653405636548996, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.210249900817871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.13877643644809723, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.756790161132812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002946089254692197, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.298982620239258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11562099307775497, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.07651138305664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08672082424163818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.93082618713379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003336886875331402, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.520662307739258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010991711169481277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.848628997802734, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02356633171439171, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485200881958008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09534643590450287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598484039306641, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.061295900493860245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.968879699707031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0021490901708602905, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.861794471740723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06676916033029556, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.289216041564941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05297986418008804, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.86643409729004, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007028650026768446, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.77928161621094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.025983303785324097} -{"step": 1740636160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29946899414062, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08135715126991272, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.88766860961914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002952985232695937, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.769961357116699, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02263253927230835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.586100101470947, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02432413212954998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.240571975708008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11712862551212311, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.2071533203125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14098580181598663, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.756162643432617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003137198043987155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.29790210723877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12652112543582916, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.076104164123535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09140706807374954, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.93802261352539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0037898218724876642, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.54545783996582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01014868076890707, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.864377975463867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.019806653261184692, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485297203063965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09353243559598923, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598913669586182, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06041235104203224, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96853256225586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002332388423383236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.861051559448242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06257802993059158, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.289191246032715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04769761860370636, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.878286361694336, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007740238215774298, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.8644256591797, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02445443533360958} -{"step": 1761607680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30183410644531, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08301523327827454, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.884284973144531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002986710984259844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.768685340881348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02549327351152897, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.584722518920898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02643347717821598, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2374701499938965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1131570115685463, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.204189300537109, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14682981371879578, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.755735397338867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0032498070504516363, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.297112464904785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12789121270179749, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.075812339782715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09592141211032867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.944622039794922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038961938116699457, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.568089485168457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011474471539258957, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.878920555114746, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.021898027509450912, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485411167144775, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10160917043685913, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.59935188293457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06144299730658531, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.968109130859375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0022265382576733828, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.860191345214844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0647740289568901, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288993835449219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.050749968737363815, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.888843536376953, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007976936176419258, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.93971252441406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.026957180351018906} -{"step": 1782579200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30370330810547, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07927730679512024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.881589889526367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002693441929295659, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.768372535705566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025186611339449883, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.584235668182373, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02624284103512764, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2347798347473145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.0998387336730957, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.201614856719971, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12936028838157654, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.755196571350098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0028796421829611063, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.296255111694336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11523925513029099, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.075522422790527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08761660009622574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.950559616088867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003360658884048462, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.588417053222656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011239411309361458, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.89210033416748, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02134547382593155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485527515411377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09042518585920334, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599705219268799, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06007709354162216, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.967744827270508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0018105562776327133, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.859458923339844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05939112976193428, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288792610168457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.046413179486989975, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.898178100585938, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007792770862579346, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.00672912597656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.028985053300857544} -{"step": 1803550720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30525207519531, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0770765170454979, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.878975868225098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0024981722235679626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.767946243286133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.021433446556329727, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.583635330200195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023308278992772102, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.232335090637207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09752292186021805, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.199291229248047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12312071770429611, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754823684692383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002576131373643875, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.295553207397461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1045280173420906, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.075255393981934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07879471033811569, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.95571517944336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00335111771710217, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.605810165405273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009705595672130585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.903463363647461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.019427426159381866, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485556125640869, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08164845407009125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599891662597656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05422954261302948, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96755599975586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0015634829178452492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.85898494720459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05392397940158844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288763999938965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04137830436229706, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.90637969970703, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007748925127089024, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.06512451171875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.025257451459765434} -{"step": 1824522240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3062744140625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07954614609479904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.876935005187988, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002664984669536352, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.767707347869873, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027272408828139305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.583344459533691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.032549865543842316, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.230284690856934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09768880903720856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.1973490715026855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12598533928394318, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754560470581055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0023990734480321407, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.294981956481934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10500999540090561, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.07510757446289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08036435395479202, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.96027374267578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0032978705130517483, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.620956420898438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010981857776641846, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.9133939743042, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.019771188497543335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485602855682373, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07821982353925705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600110054016113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05461509898304939, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.967227935791016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0018130993703380227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.858348846435547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05494045093655586, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288603782653809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04276328906416893, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.913549423217773, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007678695023059845, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.11581420898438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.025038929656147957} -{"step": 1845493760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3069839477539, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0833892896771431, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.874884605407715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0024224643129855394, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.767399311065674, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024699125438928604, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.583018779754639, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.027913548052310944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.228301525115967, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09889056533575058, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.1954803466796875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1315154880285263, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.75438404083252, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0027602538466453552, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.294524192810059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11701829731464386, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.075027465820312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08582821488380432, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.964378356933594, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0037905096542090178, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.634467124938965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01245292741805315, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.922295570373535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02779087983071804, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485658168792725, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09573546797037125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.60030460357666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05733238533139229, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.9668607711792, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019078869372606277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.857641220092773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.061389029026031494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288381576538086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04852864146232605, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.91971206665039, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006761385127902031, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.15911865234375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02687910385429859} -{"step": 1866465280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30747985839844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0726909264922142, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.873202323913574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002775400411337614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.767306327819824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.022367333993315697, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.582931041717529, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.025684518739581108, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.226651668548584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09361670166254044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.193917274475098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12418487668037415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754112243652344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002764130709692836, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.294015884399414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11133955419063568, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.074820518493652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0817263051867485, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.967975616455078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003310619620606303, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.646292686462402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009962246753275394, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.930107116699219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.019632600247859955, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485745906829834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08591790497303009, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600498199462891, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05701206997036934, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966755867004395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019019626779481769, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.857278823852539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06164992228150368, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288395881652832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04628751054406166, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.924955368041992, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0064211455173790455, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.19577026367188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.024454280734062195} -{"step": 1887436800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30777740478516, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07317695766687393, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.871648788452148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0025591079611331224, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.766999244689941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02192692831158638, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5825886726379395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023449493572115898, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.225192546844482, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.0950515940785408, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.192529678344727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12211092561483383, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.754042625427246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0026701700408011675, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.293770790100098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11063097417354584, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.074752807617188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08072952181100845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.970979690551758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033711371943354607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.656081199645996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010740889236330986, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.93657112121582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02113603614270687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485791206359863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07680659741163254, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600643634796143, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.054718777537345886, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966556549072266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014678615843877196, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.856805801391602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05298593267798424, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288273811340332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.041318561881780624, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.929332733154297, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007524071726948023, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.2264404296875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02490144781768322} -{"step": 1908408320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30787658691406, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0783185213804245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.870502471923828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002759814728051424, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.766800880432129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02124878577888012, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.58237886428833, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.024322882294654846, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.224062919616699, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.10034111887216568, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.191442012786865, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12295131385326385, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.753885269165039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0025354160461574793, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.293444633483887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10041908174753189, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.074633598327637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07724865525960922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.973539352416992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0030761363450437784, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.664387702941895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009914977476000786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.942087173461914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017682835459709167, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485882759094238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08517009764909744, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600836277008057, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05420570820569992, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966334342956543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0015557444421574473, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.856365203857422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.050484798848629, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288137435913086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03878837451338768, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.932939529418945, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007259681820869446, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.25177001953125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.023209910839796066} -{"step": 1929379840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30796813964844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07304441183805466, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.869444847106934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0025357448030263186, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.766559600830078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023417653515934944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.58211612701416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023926716297864914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.223062515258789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08693817257881165, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.19049072265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.11245308816432953, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.753801345825195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0023753230925649405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.293249130249023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09824188798666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.07458209991455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0737084373831749, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.97557258605957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003250150242820382, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.670979499816895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009210139513015747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.946492195129395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017428014427423477, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485898971557617, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07025467604398727, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600912094116211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05143967270851135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966230392456055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.001410463242791593, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.85610294342041, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05107097700238228, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288073539733887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03957672044634819, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.93583106994629, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007016855292022228, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.27200317382812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.023509446531534195} -{"step": 1950351360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30799102783203, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0708552822470665, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.868549346923828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002169492421671748, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.766197681427002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02314005047082901, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.581750869750977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02516099438071251, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.222243309020996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08394356817007065, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.189713478088379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10414139181375504, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.75374698638916, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0020766614470630884, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.293100357055664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09195587038993835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.074541091918945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07041405886411667, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.977136611938477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0029360123444348574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.676013946533203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009264507330954075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.949865341186523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017856428399682045, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485900402069092, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07165051251649857, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600946426391602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.04943997040390968, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96623420715332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014284662902355194, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.855978965759277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05014224350452423, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288086891174316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03702230751514435, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.93808364868164, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007235592231154442, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.2876434326172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02410583384335041} -{"step": 1971322880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30796813964844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.06929617375135422, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.867960929870605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0021868422627449036, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.76600456237793, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02202497236430645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.58154821395874, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.022933876141905785, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2216596603393555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08753909170627594, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.189162254333496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.11042042821645737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.753679275512695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0022353141102939844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.292943954467773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09020721167325974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.074493408203125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06956246495246887, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.978351593017578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003324548713862896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.679862976074219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009175158105790615, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.9524564743042, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.016225401312112808, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485929012298584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07341552525758743, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601015090942383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0513673759996891, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966200828552246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014639041619375348, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.855842590332031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04850001260638237, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288080215454102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.036977093666791916, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.939783096313477, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008106520399451256, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.2993927001953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.023978501558303833} -{"step": 1992294400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30790710449219, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07455417513847351, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.867557525634766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0024499220307916403, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.765953063964844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024226337671279907, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.58148193359375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.028247008100152016, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.221253395080566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08643943816423416, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.1887736320495605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10681148618459702, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.753591537475586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0020839455537497997, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.292778015136719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09574875235557556, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.074426651000977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0701284408569336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.979246139526367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033866658341139555, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.682668685913086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009271507151424885, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.954351425170898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017369091510772705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485957622528076, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06717965006828308, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601077556610107, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.047580331563949585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966156959533691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0011962542776018381, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.85572338104248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.046751923859119415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288070678710938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03510065749287605, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.941011428833008, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007207591086626053, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.307861328125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.022721780464053154} -{"step": 2013265920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30786895751953, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.06951750069856644, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.86723804473877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0024551358073949814, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.765934944152832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023639313876628876, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5814528465271, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02369207888841629, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.220944404602051, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08148231357336044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.188490867614746, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10246472805738449, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.753560066223145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0021781688556075096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.292693138122559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.08974441140890121, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.074399948120117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06681162118911743, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.9798526763916, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003002544166520238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.684564590454102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009060111828148365, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.955626487731934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.016890257596969604, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485980987548828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06181700900197029, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601123809814453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.046648502349853516, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966113090515137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.001191629795357585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.85561752319336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.046260058879852295, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288046836853027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03640136495232582, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.94183921813965, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00713229738175869, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.31350708007812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02305736392736435} -{"step": 2034237440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3078384399414, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07081601023674011, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.867043495178223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0022190369199961424, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.765921115875244, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0227583646774292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5814313888549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02407023124396801, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.220749378204346, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.07948136329650879, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.1883134841918945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10079348087310791, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.753552436828613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002116377931088209, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.29266357421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.08930982649326324, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.074396133422852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06716953217983246, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.9802188873291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00337780499830842, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.685711860656738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009380814619362354, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.956403732299805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017015352845191956, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.48599100112915, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06443130970001221, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601144790649414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.04685555770993233, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966104507446289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0011197353014722466, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.855579376220703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04511329159140587, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288046836853027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.034088630229234695, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.94234275817871, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007591011468321085, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.3169708251953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.023064933717250824} -{"step": 2055208960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30782318115234, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.06833957135677338, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.86694622039795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002288439543917775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.765910625457764, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.022526917979121208, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.581418991088867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02348254807293415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2206501960754395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08163854479789734, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.1882219314575195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.0987466424703598, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.753545761108398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002164694247767329, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.292642593383789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.0851939395070076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.07438850402832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06491274386644363, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.98041534423828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0029752489645034075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.686324119567871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.00917787291109562, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.956820487976074, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.01617969200015068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485995292663574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06191352382302284, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601153373718262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.04606229439377785, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966094017028809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.001312306965701282, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.855545997619629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.046333443373441696, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.288039207458496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03536451980471611, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.942602157592773, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007678853813558817, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.3187713623047, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02323606237769127} +{"step": 20971520, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3585.87890625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006323995068669319, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.986734390258789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005813214927911758, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 9.22622013092041, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.011783506721258163, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 9.264309883117676, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.0109311044216156, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 9.211417198181152, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08205569535493851, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.24279499053955, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.07186440378427505, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.044992446899414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.015252172015607357, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 22.769420623779297, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0947781428694725, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.377448081970215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.12298791110515594, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.985852241516113, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0026790020056068897, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.240193367004395, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008056988939642906, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.249547958374023, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.008285393007099628, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.202188491821289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.058652233332395554, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.217767715454102, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08365912735462189, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.04620933532715, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.030267445370554924, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 22.766273498535156, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.22848264873027802, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.389978408813477, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.30379021167755127, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 15.95506477355957, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.04823115095496178, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 127.22613525390625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.8972803950309753} +{"step": 41943040, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3581.544921875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0015263058012351394, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.954889297485352, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002032625488936901, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 9.299200057983398, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.009988500736653805, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 9.341612815856934, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.008338921703398228, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 9.325764656066895, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.02128537744283676, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.381427764892578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.01764068752527237, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.153043746948242, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0010973397875204682, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 23.453269958496094, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.012549151666462421, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.00796127319336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.031537674367427826, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.968523025512695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00026522655389271677, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.370798110961914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.0015512247337028384, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.37192440032959, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0023018009960651398, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.284553527832031, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.003514921525493264, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.30654239654541, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.004735336638987064, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.141569137573242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0008969121263362467, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.290735244750977, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.008646462112665176, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.954242706298828, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0351794958114624, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.039657592773438, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.07284313440322876, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 129.6841278076172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.6506856083869934} +{"step": 62914560, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3577.65625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0032590709161013365, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.93447494506836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003908559679985046, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 9.402969360351562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.022825980558991432, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 9.433878898620605, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.015147489495575428, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 9.473119735717773, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.028542323037981987, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.524731636047363, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.02294442057609558, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.218891143798828, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0018193618161603808, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 24.007732391357422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.020912881940603256, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.516311645507812, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.05653133988380432, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.963053703308105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0003478245052974671, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.499908447265625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.0013518866617232561, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.494308471679688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.002075089840218425, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.37143611907959, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.005934854503720999, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.39819049835205, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.010034997016191483, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.162311553955078, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.001822464051656425, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.602169036865234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.013838428072631359, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.346477508544922, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03304202854633331, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.13155174255371, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.012952751480042934, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 134.32614135742188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.27288368344306946} +{"step": 83886080, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3574.011474609375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.004218939691781998, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.92776107788086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004567786585539579, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 9.525163650512695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.028872549533843994, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 9.537534713745117, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.026383012533187866, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 9.603059768676758, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.04415915533900261, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.631916046142578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.031069913879036903, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.2464656829834, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0032430379651486874, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 24.307933807373047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.027778148651123047, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.82164478302002, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06485233455896378, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.95949649810791, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0010236374801024795, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.5707368850708, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.006540937814861536, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.568937301635742, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.005721134599298239, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.456015586853027, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.008030996657907963, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.485960006713867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.007060752250254154, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.14731216430664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0005494338111020625, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.704490661621094, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.009207009337842464, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.55056095123291, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.01256099808961153, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.169767379760742, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.015665628015995026, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 137.54031372070312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.21493282914161682} +{"step": 104857600, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3570.470947265625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005424803122878075, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.935561180114746, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005046219099313021, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 9.680547714233398, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.036949943751096725, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 9.67404842376709, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03656087815761566, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 9.72243595123291, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08290404081344604, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.71654224395752, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.070151686668396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.224117279052734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005141772795468569, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 24.43528175354004, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.052713312208652496, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 11.013069152832031, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08725735545158386, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.959779739379883, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0068303123116493225, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.636144638061523, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011727717705070972, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.616986274719238, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.013453400693833828, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.507692337036133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05718284472823143, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.54469108581543, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06052389368414879, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.129188537597656, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00703220022842288, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.742788314819336, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07743687927722931, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.650816917419434, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03275134414434433, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.22039222717285, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.012110945768654346, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 139.7522430419922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.16124582290649414} +{"step": 125829120, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3566.9765625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005867136642336845, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.938095092773438, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005842261947691441, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 9.810138702392578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04339815676212311, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 9.777499198913574, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.041711557656526566, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 9.81181526184082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.075184166431427, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.776444435119629, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.07119052857160568, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.191692352294922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006242226343601942, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 24.506282806396484, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05342089757323265, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 11.153679847717285, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06428540498018265, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.958102226257324, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005311193875968456, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.67680549621582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008319403044879436, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.649229049682617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.010648003779351711, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.525718688964844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.03953470662236214, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.56295394897461, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04883531853556633, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.108335494995117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00560220330953598, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.730453491210938, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09107042849063873, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.667997360229492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03963166102766991, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.294065475463867, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009401747025549412, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 141.58230590820312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.10852779448032379} +{"step": 146800640, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3563.476318359375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006061962340027094, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.940529823303223, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005118690896779299, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 9.924413681030273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.028911275789141655, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 9.866061210632324, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.026822563260793686, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 9.884973526000977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07900336384773254, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.82559585571289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.06503617763519287, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.17411994934082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005857010837644339, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 24.600141525268555, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05409305542707443, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 11.299821853637695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.056708626449108124, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.955729484558105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0017172022489830852, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.69444751739502, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.006286215968430042, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.668268203735352, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.005678760819137096, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.53417682647705, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.01958923041820526, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.569130897521973, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.032209787517786026, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.0965576171875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003821730148047209, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.71497917175293, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0737115740776062, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.671407699584961, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03165173903107643, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.388713836669922, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.012096140533685684, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 143.58633422851562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06509974598884583} +{"step": 167772160, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3559.96435546875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006260845810174942, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.945326805114746, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006318557076156139, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.024003982543945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.056847162544727325, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 9.948001861572266, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.06507287174463272, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 9.948936462402344, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08082656562328339, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.860431671142578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.06193951889872551, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.186464309692383, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004078098572790623, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 24.748441696166992, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.04538853093981743, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 11.471303939819336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.04761291667819023, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.95386028289795, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0015460415743291378, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.705833435058594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.007724580820649862, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.683286666870117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.007313838694244623, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.538752555847168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.016601337119936943, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.570796966552734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.021054143086075783, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.090078353881836, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0020610385108739138, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.6977596282959, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04924076795578003, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.661310195922852, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.022083593532443047, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.497190475463867, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.015279398299753666, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 145.81134033203125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05961906164884567} +{"step": 188743680, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3556.463134765625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009356333874166012, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.952763557434082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008578632958233356, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.120271682739258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0566399022936821, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.034550666809082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05668582767248154, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.003116607666016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1469486504793167, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.886467933654785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13319778442382812, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.208526611328125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.008025318384170532, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 24.910680770874023, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09857229143381119, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 11.639714241027832, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10091014206409454, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.952399253845215, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004428979940712452, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.718561172485352, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02699918858706951, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.698831558227539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0223908182233572, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.541362762451172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.061275362968444824, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.57063102722168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08072564750909805, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.087299346923828, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.009873921051621437, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.684595108032227, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.12737704813480377, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.655739784240723, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.055429644882678986, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.612722396850586, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01799042709171772, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 148.21759033203125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0969938412308693} +{"step": 209715200, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3552.9892578125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005517409183084965, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.961624145507812, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005108471028506756, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.209575653076172, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.037277281284332275, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.116769790649414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03792709857225418, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.044777870178223, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07170242816209793, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.905014038085938, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.06454399228096008, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.23490333557129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0045053958892822266, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 25.07438087463379, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05549774318933487, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 11.7957763671875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06415290385484695, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.952661514282227, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0022916521411389112, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.734306335449219, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009657173417508602, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.715261459350586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.009367027319967747, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.54203987121582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.01878405548632145, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.569131851196289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.02807873673737049, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.083677291870117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0031704106368124485, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.668264389038086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0661737248301506, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.64652156829834, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03106365166604519, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.73287582397461, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01654941402375698, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 150.7775421142578, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08438514918088913} +{"step": 230686720, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3549.549072265625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.015393799170851707, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.970928192138672, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.01483925711363554, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.293045043945312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.058916185051202774, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.196504592895508, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05913504585623741, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.081242561340332, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.24541060626506805, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.921795845031738, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.28118374943733215, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.264835357666016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.029023688286542892, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 25.243160247802734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.2503472864627838, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 11.940641403198242, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.3014214336872101, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.95241928100586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.011413434520363808, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.749923706054688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.037763480097055435, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.731120109558105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.040122274309396744, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.542871475219727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07394985109567642, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.567291259765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.13417086005210876, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.077945709228516, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.024245690554380417, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.652244567871094, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.4347669780254364, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.630656242370605, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.21151286363601685, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.849756240844727, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.04472949355840683, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 153.44552612304688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.23296867311000824} +{"step": 251658240, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3546.150146484375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.008623725734651089, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.978719711303711, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008114367723464966, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.365741729736328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05241962894797325, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.265873908996582, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05220724642276764, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.111873626708984, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.12292210757732391, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.936120986938477, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0959014743566513, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.28751564025879, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006024819333106279, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 25.384817123413086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07642516493797302, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.046393394470215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08807393908500671, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.951213836669922, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0014182371087372303, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.755594253540039, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010309918783605099, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.739227294921875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.00860229879617691, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.544881820678711, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.018855474889278412, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.56664752960205, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.034539442509412766, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.068605422973633, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004135791677981615, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.63324737548828, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.10622759163379669, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.588372230529785, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04882499575614929, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.95037269592285, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01106222253292799, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 156.0207061767578, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07086677104234695} +{"step": 272629760, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3542.779052734375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006810943130403757, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.985916137695312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006454857997596264, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.43223762512207, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.056223586201667786, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.330218315124512, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.058171551674604416, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.137276649475098, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09257037192583084, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.948196411132812, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0747305229306221, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.310590744018555, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0044560860842466354, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 25.52574348449707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05368860438466072, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.151346206665039, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.052852410823106766, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.952607154846191, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0015900008147582412, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.769059181213379, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009543420746922493, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.753323554992676, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.009224224835634232, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.546079635620117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.020635923370718956, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.565682411193848, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.023962033912539482, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.064783096313477, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002602787222713232, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.621267318725586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.040426105260849, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.57606029510498, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.024434693157672882, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.05779266357422, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.011935953982174397, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 158.59104919433594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.054098233580589294} +{"step": 293601280, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3539.430419921875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0067461105063557625, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.99271297454834, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006930368021130562, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.495278358459473, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05009077489376068, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.392062187194824, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.04912539944052696, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.159712791442871, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09602733701467514, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.96003532409668, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09553088992834091, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.339820861816406, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.008527282625436783, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 25.683969497680664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08543337881565094, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.266982078552246, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0956072136759758, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.956061363220215, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003491821000352502, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.79068660736084, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011623155325651169, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.773252487182617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.014404095709323883, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.548477172851562, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.02556944452226162, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.56507396697998, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03831855207681656, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.064706802368164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.006154166534543037, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.618907928466797, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.11255592107772827, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.582945823669434, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05572429671883583, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.171512603759766, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009613639675080776, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 161.21157836914062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.080792136490345} +{"step": 314572800, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3536.106689453125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.00998227670788765, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.99724292755127, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009438646957278252, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.546557426452637, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.07580174505710602, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.441717147827148, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.07526622712612152, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.180169105529785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13597224652767181, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.971176147460938, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11970951408147812, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.370738983154297, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.008082053624093533, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 25.83755874633789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08426141738891602, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.376396179199219, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08177628368139267, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.959354400634766, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0030325462576001883, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.811175346374512, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.022909684106707573, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.791274070739746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.024072881788015366, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.551311492919922, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.03606449067592621, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.564996719360352, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04307352378964424, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.065168380737305, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.006133991293609142, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.620058059692383, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08665794134140015, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.589972496032715, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.041698623448610306, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.28446388244629, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.014112215489149094, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 163.81600952148438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.050964612513780594} +{"step": 335544320, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3532.810302734375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.010383225046098232, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.00287437438965, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009851436130702496, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.595487594604492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.049598369747400284, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.489614486694336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.050423573702573776, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.196860313415527, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14222148060798645, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.980874061584473, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10374514758586884, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.402118682861328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0056731244549155235, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 25.97983169555664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07101760804653168, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.473599433898926, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0774674341082573, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.962261199951172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002930881455540657, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.83083724975586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.018615057691931725, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.80844783782959, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023790104314684868, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.556373596191406, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.03148454427719116, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.566485404968262, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04407019540667534, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.066362380981445, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.006298020947724581, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.626819610595703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0721927136182785, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.598799705505371, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03540477156639099, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.39626693725586, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.013849002309143543, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 166.37249755859375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06951813399791718} +{"step": 356515840, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3529.550048828125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006449004169553518, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.009082794189453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006358795333653688, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.638776779174805, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.06430598348379135, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.533318519592285, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.06467295438051224, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.209417343139648, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10155041515827179, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.9887056350708, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08735688775777817, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.428098678588867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004925544373691082, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.09998321533203, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06405705958604813, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.548104286193848, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07208545506000519, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.964852333068848, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003515151096507907, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.846671104431152, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.021845003589987755, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.820448875427246, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.022327151149511337, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.56163501739502, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.02232026867568493, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.5690279006958, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.029024718329310417, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.065549850463867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0034142276272177696, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.631006240844727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07062450051307678, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.58882999420166, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.035063762217760086, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.49505043029785, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01022188551723957, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 168.8275909423828, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06804605573415756} +{"step": 377487360, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3526.31103515625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.008376551792025566, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.015605926513672, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00685038510710001, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.680133819580078, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.061756376177072525, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.575413703918457, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05978317931294441, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.219904899597168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10778054594993591, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.99500560760498, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0850122943520546, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.45624542236328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0034070617984980345, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.218107223510742, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05672307312488556, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.62120246887207, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06293430924415588, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.967729568481445, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0021362104453146458, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.862878799438477, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012334390543401241, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.83313274383545, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.015433733351528645, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.567503929138184, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.021872740238904953, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.571617126464844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.023786142468452454, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.069517135620117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002180920448154211, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.64315414428711, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.03672719746828079, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.600384712219238, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.020253358408808708, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.60161018371582, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.010297439992427826, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 171.2637481689453, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.057645056396722794} +{"step": 398458880, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3523.0947265625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.012320803478360176, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.022197723388672, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010783156380057335, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.720157623291016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04268094524741173, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.616250038146973, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.04153281822800636, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.228541374206543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1790286898612976, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.000029563903809, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16519802808761597, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.489742279052734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.008127656765282154, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.34427833557129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.14502328634262085, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.699405670166016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.19157899916172028, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.971162796020508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00792423915117979, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.881932258605957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.05929902568459511, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.848990440368652, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.07040618360042572, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.573899269104004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11946281790733337, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.575224876403809, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.13805395364761353, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.075267791748047, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.014137685298919678, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.663034439086914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.21739234030246735, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.62258243560791, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.11229242384433746, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.713233947753906, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009393228217959404, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 173.68246459960938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09945356100797653} +{"step": 419430400, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3519.9111328125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006900247652083635, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.028263092041016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006528428755700588, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.753490447998047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.033202867954969406, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.651779174804688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.0337848924100399, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.236234664916992, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10498291999101639, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.004316329956055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09000696986913681, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.520431518554688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0047908965498209, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.455198287963867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06348186731338501, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.765274047851562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06820078939199448, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.973631858825684, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003005844773724675, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.89500904083252, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015421578660607338, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.86083984375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.024887975305318832, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.58124828338623, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.03325808420777321, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.579325675964355, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03332490846514702, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.078943252563477, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0034089991822838783, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.681833267211914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05462098866701126, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.63763427734375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.029608532786369324, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.822803497314453, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01203112117946148, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 176.03466796875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05739642679691315} +{"step": 440401920, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3516.75732421875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.014860875904560089, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.033897399902344, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.014724752865731716, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.783660888671875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.1345289647579193, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.684296607971191, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.14576861262321472, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.24325942993164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.19424819946289062, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.008612632751465, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.169420525431633, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.552963256835938, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.010193954221904278, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.566926956176758, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12810245156288147, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.831064224243164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.15586574375629425, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.977568626403809, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00958973914384842, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.911520004272461, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.031366683542728424, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.876228332519531, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04168199375271797, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.588544845581055, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08051314949989319, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.58407211303711, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07460296899080276, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.084688186645508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.008896107785403728, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.708057403564453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1422508805990219, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.660240173339844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.08409973978996277, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.932939529418945, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008945719338953495, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 178.31907653808594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.10139400511980057} +{"step": 461373440, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3513.642578125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005986005533486605, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.039331436157227, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005813267081975937, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.811623573303223, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03003782592713833, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.713407516479492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.028821822255849838, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.249053955078125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09753496944904327, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.011401176452637, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1043923869729042, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.580034255981445, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.007231233175843954, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.660879135131836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09331657737493515, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.883437156677246, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.106775663793087, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.981253623962402, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0031827217899262905, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.92441177368164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.016716230660676956, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.88707160949707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023682033643126488, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.595473289489746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.035490747541189194, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.588563919067383, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04266831651329994, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.089279174804688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003961221780627966, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.733970642089844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09139540791511536, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.671441078186035, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04906618222594261, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.033342361450195, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007613579276949167, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 180.51962280273438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07166241854429245} +{"step": 482344960, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3510.5576171875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009349091909825802, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.045589447021484, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009005295112729073, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.839374542236328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04485252872109413, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.743867874145508, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.047823529690504074, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.252814292907715, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13735535740852356, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.013565063476562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.132121741771698, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.611404418945312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.007462042849510908, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.76123046875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0866238996386528, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.940073013305664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10997512191534042, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.985601425170898, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0049051083624362946, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.939993858337402, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.03215721249580383, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.900948524475098, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04246847331523895, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.602681159973145, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08763491362333298, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.593291282653809, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08204063028097153, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.098236083984375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.006830667145550251, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.768381118774414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07292734086513519, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.70206069946289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03368354216217995, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.14104461669922, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008511473424732685, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 182.67254638671875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06344812363386154} +{"step": 503316480, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3507.510009765625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009860014542937279, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.051645278930664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009601757861673832, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.864950180053711, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0650637224316597, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.771415710449219, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.07553726434707642, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.256502151489258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14146429300308228, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.015015602111816, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10902765393257141, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.642654418945312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005732554476708174, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.858619689941406, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08250138908624649, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 12.993831634521484, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09473224729299545, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.989933013916016, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00324709783308208, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.955121994018555, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013759848661720753, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.915315628051758, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.021037284284830093, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.609374046325684, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.0417049266397953, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.597317695617676, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04473457857966423, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.10674285888672, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003440829925239086, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.802764892578125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07124648243188858, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.731082916259766, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.044605888426303864, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.248428344726562, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01294850092381239, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 184.7506103515625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.055864859372377396} +{"step": 524288000, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3504.5, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.011584457010030746, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.05695343017578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010538598522543907, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.887200355529785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04816485196352005, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.794876098632812, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.048934973776340485, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.259103775024414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16382427513599396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.016145706176758, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13447386026382446, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.6713809967041, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.008156409487128258, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 26.94510269165039, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12118273973464966, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.04128646850586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.14334654808044434, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.99407958984375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004177269525825977, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.97097396850586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015244467183947563, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.929752349853516, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01986868679523468, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.615089416503906, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06861905008554459, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.601229667663574, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07253916561603546, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.116722106933594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.005802445579320192, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.84272003173828, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1302775740623474, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.767929077148438, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0777064636349678, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.35611915588379, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007146485149860382, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 186.7506866455078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07103117555379868} +{"step": 545259520, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3501.535400390625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.01088086050003767, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.0617733001709, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010662532411515713, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.907999038696289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05234783887863159, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.817185401916504, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.058535605669021606, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.261126518249512, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.18437452614307404, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.016796112060547, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.17337138950824738, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.696964263916016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.007574084680527449, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.02156639099121, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12501809000968933, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.080544471740723, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.16145898401737213, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.996916770935059, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.010634000413119793, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.983139038085938, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.04254318028688431, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.941365242004395, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.057610027492046356, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.619441032409668, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12217170745134354, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.6041898727417, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09920046478509903, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.11931610107422, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.008044460788369179, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.86786651611328, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1199655756354332, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.784780502319336, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.061495453119277954, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.452680587768555, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009781631641089916, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 188.63009643554688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1132463663816452} +{"step": 566231040, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3498.610107421875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.008268188685178757, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.06780433654785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00796574354171753, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.929824829101562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05369338393211365, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.841042518615723, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.06068415194749832, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.26241683959961, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11754916608333588, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.01589584350586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0964600071310997, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.719045639038086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004722523503005505, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.085973739624023, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06330888718366623, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.112924575805664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06937503069639206, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.999931335449219, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0022689085453748703, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.994046211242676, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013969467021524906, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.95341682434082, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.018514813855290413, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.623188972473145, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.030679209157824516, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.607577323913574, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.029809847474098206, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.126338958740234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.001946109696291387, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.898616790771484, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.040927838534116745, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.811441421508789, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.02800334058701992, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.549606323242188, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.010253007523715496, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 190.42813110351562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04839291796088219} +{"step": 587202560, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3495.721435546875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006824243813753128, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.073415756225586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005982102360576391, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.95025634765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03066674992442131, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.86420726776123, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03259284794330597, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.263056755065918, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11078756302595139, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.014532089233398, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11092313379049301, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.743345260620117, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006965207401663065, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.151933670043945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10050996392965317, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.147433280944824, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.12073390930891037, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.00243377685547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003500401508063078, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.00330638885498, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01837201789021492, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.96288776397705, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.026629360392689705, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.625710487365723, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.054533008486032486, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.609986305236816, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.05338519066572189, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.13837242126465, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003635460278019309, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.94397735595703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09102194756269455, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.852316856384277, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06490911543369293, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.6531982421875, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.015472713857889175, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 192.1757354736328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06963207572698593} +{"step": 608174080, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3492.87109375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.01134779304265976, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.08043098449707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010272403247654438, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.974067687988281, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.08270470798015594, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.891220092773438, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.09547960758209229, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.263498306274414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15134266018867493, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.012812614440918, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12580737471580505, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.769149780273438, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005015640053898096, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.219274520874023, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08203869313001633, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.182572364807129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09186311066150665, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.00611114501953, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00354568543843925, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.016125679016113, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013097682036459446, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.97469425201416, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.022982480004429817, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.628756523132324, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05549878999590874, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.612714767456055, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.050409648567438126, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.1523380279541, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00289122574031353, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 23.996702194213867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05795261636376381, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.89953327178955, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.035516779869794846, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.759187698364258, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01004074327647686, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 193.84542846679688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04863952845335007} +{"step": 629145600, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3490.06884765625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0110209621489048, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.086074829101562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009714844636619091, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 10.992527961730957, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.051458511501550674, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.911864280700684, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05642317607998848, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.263442039489746, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1655798852443695, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.01142406463623, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16949111223220825, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.79148292541504, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.007532638031989336, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.27861213684082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11875041574239731, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.212430953979492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.16005371510982513, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.008310317993164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.007561093661934137, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.025227546691895, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.03295249864459038, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.983246803283691, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.05349228158593178, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.631285667419434, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11149884015321732, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.615468978881836, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09316114336252213, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.162776947021484, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.005419726483523846, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.042984008789062, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09950411319732666, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.931429862976074, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05564573034644127, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.855037689208984, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.013472238555550575, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 195.42393493652344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06794838607311249} +{"step": 650117120, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3487.30859375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.010248369537293911, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.092941284179688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010405913926661015, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.014558792114258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03709695488214493, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.936502456665039, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.04023831710219383, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.262482643127441, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15186794102191925, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.008602142333984, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12211384624242783, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.811729431152344, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005799711681902409, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.330904006958008, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07860928028821945, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.238473892211914, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0899227112531662, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.011154174804688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0035886375699192286, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.036911964416504, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01473289541900158, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 9.993033409118652, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02213270589709282, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.633602142333984, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.051735635846853256, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.618467330932617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.043279748409986496, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.17660140991211, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002598551334813237, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.092681884765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04985353350639343, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.968523979187012, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.032373782247304916, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.948741912841797, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00925598107278347, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 196.92054748535156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05280328541994095} +{"step": 671088640, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3484.587646484375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.00710269995033741, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.100419998168945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006495304871350527, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.037111282348633, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.028430450707674026, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.961181640625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.026297125965356827, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.261137008666992, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10749237984418869, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.005535125732422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09852709621191025, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.833707809448242, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004514497704803944, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.385122299194336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07108400017023087, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.26506233215332, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09228689223527908, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.0136661529541, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0041005234234035015, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.047477722167969, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.016266915947198868, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.00145435333252, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03371940180659294, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.634993553161621, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05925728753209114, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.620899200439453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.046969275921583176, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.194303512573242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0024821211118251085, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.150135040283203, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05345563217997551, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.016013145446777, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04824370518326759, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.047000885009766, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007254213094711304, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.3530731201172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.042954590171575546} +{"step": 692060160, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3481.91748046875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.01229430828243494, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.10643196105957, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.011233395896852016, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.05509090423584, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0475471206009388, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 10.98171329498291, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05270359665155411, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.26024055480957, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.18830212950706482, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 10.002433776855469, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.18870775401592255, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.85501480102539, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.008288148790597916, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.43765640258789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1455639749765396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.29102897644043, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.20302024483680725, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.01458740234375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.009744890034198761, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.052664756774902, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.022143274545669556, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.005840301513672, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04313281923532486, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.63632869720459, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1702807992696762, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.623296737670898, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.1243516132235527, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.21148109436035, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0073399669490754604, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.209247589111328, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09348942339420319, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.063151359558105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06419037282466888, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.144695281982422, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.010935906320810318, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.71063232421875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05486848205327988} +{"step": 713031680, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3479.295654296875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.011333861388266087, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.112367630004883, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010810934007167816, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.072932243347168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.06839489936828613, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.00253963470459, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.07525856792926788, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.259024620056152, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.17656561732292175, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.999245643615723, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15390154719352722, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.874290466308594, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006153775379061699, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.48409652709961, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09393097460269928, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.313392639160156, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13136157393455505, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.016386032104492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.006471457425504923, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.06063461303711, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.023505549877882004, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.011672019958496, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03953205794095993, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.63765811920166, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10783282667398453, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.625515937805176, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07859349995851517, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.22636604309082, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0043866937048733234, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.26387596130371, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0649585947394371, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.107158660888672, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.045726705342531204, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.238733291625977, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009742051362991333, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 200.9904327392578, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06209244206547737} +{"step": 734003200, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3476.71826171875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.010551116429269314, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.119075775146484, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010508871637284756, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.092495918273926, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05036025866866112, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.024299621582031, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.06035970151424408, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.257002830505371, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15951181948184967, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.99548053741455, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14235679805278778, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.894615173339844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0072867609560489655, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.531057357788086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10698563605546951, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.33617115020752, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1470641791820526, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.017871856689453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.006988075561821461, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.068193435668945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02783648483455181, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.016721725463867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04151628911495209, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.638070106506348, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09945724904537201, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.627082824707031, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07910915464162827, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.24494171142578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00437871553003788, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.32714080810547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0969095230102539, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.15707778930664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.1028226688504219, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.334815979003906, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01471006404608488, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 202.21133422851562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07453205436468124} +{"step": 754974720, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3474.18994140625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.014236009679734707, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.12537384033203, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.013269182294607162, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.109707832336426, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.06711570173501968, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.044132232666016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.07462286949157715, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.254672050476074, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.22661076486110687, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.991384506225586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.21807418763637543, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.913816452026367, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.009234326891601086, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.57392120361328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13502953946590424, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.357460975646973, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.17912721633911133, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.01923942565918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.01003858633339405, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.074675559997559, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02990064024925232, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.02098274230957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03862170875072479, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.638300895690918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.177170068025589, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.628981590270996, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.11688543856143951, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.265731811523438, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.005123141221702099, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.395292282104492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07157164067029953, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.205028533935547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04464852437376976, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.42731475830078, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008290709927678108, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 203.3717803955078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04855489730834961} +{"step": 775946240, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3471.712646484375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.013137677684426308, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.131690979003906, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.012129553593695164, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.127827644348145, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.041723378002643585, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.064044952392578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05256382003426552, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.252230644226074, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.19137869775295258, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.987451553344727, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.17791320383548737, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.9315185546875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.007365366443991661, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.61275863647461, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12654241919517517, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.376368522644043, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.17097792029380798, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.020328521728516, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00762030528858304, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.079866409301758, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02354402281343937, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.024528503417969, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.039584867656230927, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.638801574707031, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12212255597114563, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.630866050720215, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08605556935071945, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.285593032836914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0036185430362820625, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.461700439453125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08160258084535599, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.25490951538086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0837540552020073, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.51915168762207, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006889111362397671, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 204.4741973876953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06076480448246002} +{"step": 796917760, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3469.284423828125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009472460485994816, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.137557983398438, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00809080433100462, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.143678665161133, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04796949028968811, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.081170082092285, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05117855966091156, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.250375747680664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1366322934627533, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.983504295349121, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12508003413677216, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.94853401184082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004451759159564972, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.649799346923828, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09160783141851425, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.394237518310547, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11734291911125183, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.02153205871582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005741445813328028, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.083775520324707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.021890174597501755, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.027414321899414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03737454116344452, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.638971328735352, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09569304436445236, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.632159233093262, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06771941483020782, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.303571701049805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003038219641894102, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.523113250732422, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05080508440732956, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.300548553466797, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05073569715023041, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.605741500854492, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007896254770457745, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 205.51495361328125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.048129383474588394} +{"step": 817889280, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3466.9052734375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.010477997362613678, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.14413070678711, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008975685574114323, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.162308692932129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05692627280950546, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.101099967956543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.06785067170858383, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.247771263122559, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1560780107975006, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.979169845581055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13972634077072144, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.966007232666016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005621789488941431, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.68515968322754, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09080605208873749, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.411235809326172, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11398281157016754, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.022315979003906, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004577907267957926, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.088001251220703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.016852550208568573, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.029711723327637, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02560087852180004, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.638289451599121, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08991097658872604, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.632772445678711, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07053885608911514, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.324724197387695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00332253728993237, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.58820343017578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06138895824551582, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.348165512084961, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05607762932777405, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.69219207763672, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007985266856849194, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 206.5021514892578, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04745414853096008} +{"step": 838860800, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3464.578125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009745168499648571, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.150012969970703, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009405135177075863, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.178657531738281, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.034281112253665924, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.118605613708496, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03559038043022156, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.24503231048584, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13605326414108276, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.974791526794434, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1295572966337204, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.98375129699707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.00580357201397419, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.7202205657959, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09447737783193588, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.427804946899414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.125971257686615, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.0229434967041, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0073095704428851604, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.090465545654297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02391328476369381, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.031389236450195, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03591732308268547, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.637434005737305, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10520732402801514, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.632691383361816, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07277466356754303, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.348695755004883, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003584987483918667, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.659624099731445, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06326765567064285, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.398346900939941, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06336524337530136, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.778257369995117, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006438928656280041, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 207.4395294189453, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05900737643241882} +{"step": 859832320, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3462.303955078125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.00973437074571848, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.155654907226562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008733526803553104, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.1937894821167, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.042660024017095566, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.134902954101562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.046042196452617645, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.24195671081543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1592429280281067, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.970345497131348, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15495388209819794, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 16.999963760375977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006841431837528944, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.751358032226562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1163521558046341, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.442468643188477, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13765285909175873, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.023649215698242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.006895693019032478, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.093382835388184, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.019112927839159966, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.033140182495117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03395240008831024, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.636387825012207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10302890837192535, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.63330078125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07552780956029892, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.372072219848633, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0038869385607540607, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.729103088378906, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06535886973142624, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.4467191696167, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05853456258773804, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.861160278320312, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006965252570807934, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 208.33009338378906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.043907951563596725} +{"step": 880803840, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3460.080810546875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.008739771321415901, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.161460876464844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008174581453204155, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.209959983825684, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.034168850630521774, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.152289390563965, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03577468916773796, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.238669395446777, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14050258696079254, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.965974807739258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13819573819637299, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.01645278930664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006017552688717842, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.782394409179688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10223662108182907, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.456745147705078, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13413643836975098, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.024465560913086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00567936385050416, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.095680236816406, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02629472129046917, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.034330368041992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03577476739883423, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.635369300842285, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10270971804857254, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.633647918701172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07480984181165695, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.3956356048584, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0034074243158102036, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.798866271972656, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06917855143547058, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.495715141296387, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06853983551263809, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.942100524902344, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0056783463805913925, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 209.17568969726562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05810907110571861} +{"step": 901775360, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3457.912353515625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.011576075106859207, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.16632843017578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010245062410831451, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.222514152526855, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.048622775822877884, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.166333198547363, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05945894122123718, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.23589038848877, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16139979660511017, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.961709976196289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14540114998817444, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.032804489135742, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005625069607049227, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.81278419494629, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10219571739435196, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.470643997192383, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1458357721567154, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.025672912597656, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.006929649971425533, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.09984302520752, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.023396849632263184, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.036460876464844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03212655708193779, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.634306907653809, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12014391273260117, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.633272171020508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08567894995212555, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.41937255859375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004364463035017252, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.869075775146484, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06764717400074005, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.54609489440918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05740147456526756, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.023094177246094, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007732026278972626, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 209.9747314453125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06927917897701263} +{"step": 922746880, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3455.799072265625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.007954214699566364, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.170652389526367, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.007494896650314331, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.23401927947998, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.031075676903128624, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.178760528564453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.033950068056583405, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.232999801635742, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11623227596282959, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.956942558288574, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12257948517799377, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.049474716186523, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004506805445998907, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.843027114868164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08066638559103012, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.484504699707031, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10850896686315536, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.026517868041992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004977177828550339, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.10255241394043, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01896381564438343, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.037945747375488, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03190428763628006, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.632990837097168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08470471203327179, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.633151054382324, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0578937828540802, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.443626403808594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0023837967310100794, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 24.940401077270508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04409945756196976, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.597190856933594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04834028333425522, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.10333251953125, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007616851944476366, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 210.7401580810547, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04518687352538109} +{"step": 943718400, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3453.738525390625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.013143765740096569, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.17555046081543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.011317668482661247, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.248149871826172, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.06070645526051521, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.194046020507812, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.07612532377243042, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.229796409606934, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.19520451128482819, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.952564239501953, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1670922040939331, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.06468391418457, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006229756399989128, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.869443893432617, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1172265112400055, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.496910095214844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.15430936217308044, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.027446746826172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00873768050223589, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.105233192443848, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.021070171147584915, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.039212226867676, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03407762572169304, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.631690979003906, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12191398441791534, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.632952690124512, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08232235163450241, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.4671688079834, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003643771167844534, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.007919311523438, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06162459775805473, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.643935203552246, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05376451462507248, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.179548263549805, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007747638039290905, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 211.46168518066406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.058563269674777985} +{"step": 964689920, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3451.7314453125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.013531999662518501, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.179828643798828, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.011637738905847073, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.259446144104004, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05135135352611542, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.206193923950195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05887332186102867, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.226577758789062, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.21851545572280884, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.948486328125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.19276835024356842, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.079666137695312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.007539447396993637, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.895559310913086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13533933460712433, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.508954048156738, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1753295212984085, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.028013229370117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.008333629928529263, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.10682487487793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02123827114701271, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.039450645446777, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04111441969871521, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.630558967590332, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.14205843210220337, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.632458686828613, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09373700618743896, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.49138641357422, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003840101882815361, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.075117111206055, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06478334963321686, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.691389083862305, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06454896926879883, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.254886627197266, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00841166079044342, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 212.15438842773438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04901603236794472} +{"step": 985661440, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3449.7783203125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.011793693527579308, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.184463500976562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010079510509967804, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.271367073059082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05672779679298401, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.219263076782227, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.07386919856071472, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.223590850830078, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16813872754573822, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.943831443786621, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15192517638206482, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.094114303588867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005454261787235737, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.920869827270508, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10969018191099167, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.520398139953613, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.15987443923950195, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.029220581054688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.006850095931440592, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.11059856414795, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.023143654689192772, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.041364669799805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0352495051920414, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.629014015197754, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12806662917137146, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.631545066833496, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09188339859247208, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.51547622680664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0038323705084621906, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.1420955657959, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08595763891935349, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.737853050231934, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.10237438231706619, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.329496383666992, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.004728618543595076, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 212.81138610839844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.061571259051561356} +{"step": 1006632960, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3447.880126953125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.011050297878682613, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.188888549804688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009854225441813469, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.283220291137695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.031767748296260834, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.232072830200195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.031086845323443413, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.220297813415527, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15409478545188904, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.939523696899414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13119585812091827, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.10820198059082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005567716900259256, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.943742752075195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09071885049343109, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.53078842163086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11930897831916809, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.030439376831055, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005517842248082161, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.114255905151367, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02029741182923317, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.0430326461792, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04633951187133789, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.627289772033691, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.0950353816151619, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.630903244018555, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07177967578172684, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.538850784301758, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00304591772146523, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.203744888305664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.053898297250270844, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.780048370361328, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.048239897936582565, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.398645401000977, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006005723960697651, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 213.41806030273438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.044928986579179764} +{"step": 1027604480, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3446.037353515625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009929454885423183, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.193248748779297, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00914438534528017, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.294499397277832, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.06050492450594902, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.244133949279785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.06577881425619125, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.217061996459961, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15256260335445404, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.935279846191406, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14072757959365845, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.12205696105957, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006274758372455835, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.965742111206055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09919345378875732, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.541003227233887, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.12673863768577576, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.031494140625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00510655902326107, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.117260932922363, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.020321596413850784, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.044510841369629, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03378034383058548, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.62562084197998, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09367751330137253, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.629971504211426, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07383375614881516, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.562543869018555, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028755851089954376, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.26625633239746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06886530667543411, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.82253360748291, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.084812231361866, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.468162536621094, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.010980170220136642, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 213.99331665039062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05270728841423988} +{"step": 1048576000, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3444.24951171875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.00978483259677887, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.197296142578125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008004095405340195, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.30456829071045, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.06328956037759781, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.255013465881348, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.07071105390787125, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.213659286499023, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11511854082345963, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.93122673034668, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12010684609413147, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.13591766357422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004110554698854685, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 27.98797035217285, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0840725228190422, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.551304817199707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11333959549665451, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.032487869262695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005182639230042696, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.11943531036377, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.019713396206498146, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.045258522033691, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03455032408237457, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.623794555664062, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09260982275009155, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.629145622253418, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06941482424736023, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.58782958984375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028689266182482243, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.33006477355957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05277024954557419, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.864324569702148, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.048470061272382736, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.53529167175293, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006803110241889954, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 214.54847717285156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.059829238802194595} +{"step": 1069547520, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3442.517333984375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.008624264039099216, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.201416015625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008778531104326248, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.314596176147461, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04708728939294815, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.266160011291504, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.0574718713760376, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.210768699645996, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1395953893661499, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.927224159240723, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13402464985847473, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.148889541625977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005399015266448259, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.008438110351562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0952744334936142, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.560894012451172, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.133272185921669, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.03357696533203, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.007068205159157515, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.122481346130371, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02513875439763069, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.046869277954102, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03710569441318512, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.622275352478027, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11778794229030609, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.628424644470215, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07962162047624588, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.611295700073242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002889001974835992, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.390527725219727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05621635168790817, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.906821250915527, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05587643012404442, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.601112365722656, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006815570406615734, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 215.0784912109375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05311190336942673} +{"step": 1090519040, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3440.8408203125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.01026855781674385, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.205184936523438, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00864876713603735, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.32310962677002, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04436070844531059, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.27522087097168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.049893446266651154, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.207731246948242, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16618043184280396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.923102378845215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.175842747092247, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.16145133972168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006615485530346632, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.028072357177734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12828950583934784, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.56997299194336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.16254281997680664, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.03468894958496, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.008624443784356117, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.125624656677246, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.0287017822265625, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.048273086547852, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.041550636291503906, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.620635032653809, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.14282317459583282, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.627264022827148, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.10037989169359207, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.634740829467773, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003936893306672573, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.449987411499023, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06943323463201523, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.946606636047363, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07001592218875885, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.664091110229492, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006117918994277716, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 215.56956481933594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05635504052042961} +{"step": 1111490560, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3439.219482421875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009971288964152336, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.20914649963379, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008973839692771435, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.332538604736328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03409070521593094, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.285845756530762, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.033509619534015656, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.20488166809082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14791229367256165, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.919381141662598, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1403583437204361, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.174829483032227, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005978717934340239, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.04833984375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1123996451497078, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.579345703125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1430024951696396, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.03571128845215, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00748367328196764, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.12819766998291, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.025977449491620064, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.049087524414062, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.037539947777986526, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.618973731994629, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12590260803699493, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.626043319702148, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08509089052677155, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.65728187561035, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0034779745619744062, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.506607055664062, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06338959187269211, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 11.985607147216797, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.053900595754384995, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.72557258605957, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007577377837151289, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 216.03855895996094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04857902601361275} +{"step": 1132462080, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3437.65185546875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009730709716677666, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.21271324157715, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009143799543380737, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.340728759765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.029756871983408928, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.295125961303711, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.028738783672451973, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.202224731445312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15690478682518005, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.915655136108398, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1490059792995453, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.18719482421875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006307968869805336, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.066978454589844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10588210076093674, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.587742805480957, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13365358114242554, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.036893844604492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.006421090103685856, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.131132125854492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.017795255407691002, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.050466537475586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.028192289173603058, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.61745548248291, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11287340521812439, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.625020980834961, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0818285197019577, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.680570602416992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0030603178311139345, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.564632415771484, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06168372184038162, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.02682876586914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06938248872756958, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.786977767944336, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008343893103301525, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 216.4839630126953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05200536921620369} +{"step": 1153433600, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3436.138916015625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.009299912489950657, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.216289520263672, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008435878902673721, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.349184036254883, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.043106064200401306, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.304683685302734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.040081415325403214, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.199560165405273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.136332169175148, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.911757469177246, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13474923372268677, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.199235916137695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004895324353128672, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.08394432067871, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09694412350654602, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.595634460449219, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13018804788589478, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.038022994995117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.006772953551262617, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.133454322814941, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.019640758633613586, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.05130386352539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.031294774264097214, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.616033554077148, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12847524881362915, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.623831748962402, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0868765264749527, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.703683853149414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0033744308166205883, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.621383666992188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.055571503937244415, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.065908432006836, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06808987259864807, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.846574783325195, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006625500041991472, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 216.91018676757812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04244154319167137} +{"step": 1174405120, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3434.6806640625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.007850795984268188, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.220224380493164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.007386139128357172, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.358332633972168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0352761410176754, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.314704895019531, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.04602763429284096, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.196850776672363, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11007487028837204, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.907829284667969, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11069221794605255, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.210731506347656, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004943848587572575, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.100183486938477, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09004087001085281, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.60329818725586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1195949986577034, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.03917121887207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00589899905025959, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.135965347290039, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.026611419394612312, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.052498817443848, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03896261751651764, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.61444091796875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10754099488258362, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.62263298034668, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07655005156993866, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.726276397705078, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004140240605920553, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.67574691772461, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06646890193223953, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.103617668151855, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07325095683336258, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.90349006652832, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008029401302337646, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 217.30850219726562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05322575941681862} +{"step": 1195376640, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3433.27685546875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0077155157923698425, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.223798751831055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006648490205407143, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.366748809814453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.060654785484075546, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.324006080627441, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.08412153273820877, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.193946838378906, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10927335917949677, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.903955459594727, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11037576198577881, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.22219467163086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003920448943972588, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.11623191833496, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0785650685429573, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.610637664794922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10350211709737778, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.040369033813477, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004486848134547472, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.139015197753906, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.017352137714624405, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.05399227142334, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03164348006248474, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.612871170043945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08219990879297256, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.621256828308105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0645219013094902, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.748336791992188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002740348456427455, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.727867126464844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.050977859646081924, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.137543678283691, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06052389740943909, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.956892013549805, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007032841444015503, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 217.68075561523438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04571656882762909} +{"step": 1216348160, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3431.926025390625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.007863770239055157, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.227069854736328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.007323430851101875, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.374228477478027, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03832629695534706, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.332174301147461, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03911198675632477, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.191109657287598, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11926514655351639, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.90019416809082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11718250811100006, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.234296798706055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004963330924510956, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.13360595703125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08793753385543823, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.618661880493164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11836886405944824, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.04164695739746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005380373913794756, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.142683029174805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01772245392203331, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.055988311767578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.026685789227485657, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.611409187316895, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10055629163980484, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.620290756225586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07292991876602173, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.7701473236084, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0033283475786447525, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.779558181762695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.060798753052949905, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.172296524047852, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.08812569826841354, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.00981330871582, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008378160186111927, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.03494262695312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.050640881061553955} +{"step": 1237319680, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3430.629150390625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006272678263485432, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.230152130126953, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005749810487031937, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.381086349487305, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.039951108396053314, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.340339660644531, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.04543471336364746, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.188519477844238, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09974642097949982, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.896693229675293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10262087732553482, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.245351791381836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0044397637248039246, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.1494197845459, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07817881554365158, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.625653266906738, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1074199229478836, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.042905807495117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005100808572024107, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.146344184875488, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01523599587380886, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.05807113647461, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02400236763060093, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.609823226928711, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09417709708213806, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.619222640991211, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06485588103532791, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.7918758392334, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0024128053337335587, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.829843521118164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.050385091453790665, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.206130027770996, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06542279571294785, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.061031341552734, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007627654820680618, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.36912536621094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.044095348566770554} +{"step": 1258291200, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3429.38525390625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0069666276685893536, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.23332977294922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005767478607594967, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.388737678527832, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.026585308834910393, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.348580360412598, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.029805025085806847, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.186094284057617, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1032404825091362, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.893387794494629, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10748517513275146, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.256574630737305, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0043150573037564754, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.16501808166504, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08038684725761414, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.632638931274414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10541785508394241, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.04422950744629, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004452279303222895, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.150073051452637, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.020161623135209084, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.060173034667969, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.030959688127040863, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.60853099822998, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.089027039706707, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.618327140808105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06458505243062973, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.812929153442383, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0022512057330459356, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.87826919555664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.049258630722761154, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.23827838897705, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04994439333677292, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.11034393310547, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0064799124374985695, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.68470764160156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04293251037597656} +{"step": 1279262720, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3428.19384765625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0065884897485375404, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.236194610595703, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006319230888038874, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.395065307617188, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.030942197889089584, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.355671882629395, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03875230997800827, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.18364143371582, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10256510972976685, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.890143394470215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1022244542837143, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.26738166809082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0039851185865700245, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.17993927001953, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07449103891849518, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.639427185058594, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09842724353075027, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.045135498046875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004935099743306637, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.151992797851562, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01375445444136858, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.061195373535156, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023662462830543518, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.607202529907227, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08667124807834625, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.617481231689453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06028447300195694, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.83235740661621, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002526467200368643, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.923816680908203, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04703793674707413, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.269267082214355, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.062350139021873474, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.15791893005371, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0075192018412053585, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.98011779785156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.043046459555625916} +{"step": 1300234240, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3427.0556640625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.00769457733258605, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.238765716552734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00658387690782547, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.400086402893066, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04180847480893135, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.361357688903809, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05409776791930199, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.181356430053711, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.12499529123306274, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.887092590332031, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12542890012264252, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.27750015258789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005234021693468094, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.19329261779785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09272483736276627, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.64537525177002, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1220213994383812, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.04612922668457, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00643793074414134, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.154338836669922, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.019489696249365807, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.062317848205566, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.031072869896888733, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.60588550567627, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1180233508348465, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.616326332092285, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08005329966545105, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.851926803588867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00302801001816988, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 25.968914031982422, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.053613126277923584, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.299240112304688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.051868945360183716, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.203704833984375, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00534785958006978, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.2627410888672, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.039490874856710434} +{"step": 1321205760, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3425.9677734375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0062333052046597, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.241300582885742, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005425793118774891, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.405237197875977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023333514109253883, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.36725902557373, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027926083654165268, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.1790189743042, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09388205409049988, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.884154319763184, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09333381056785583, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.287748336791992, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003599160350859165, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.207212448120117, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06634965538978577, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.651686668395996, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09099330753087997, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.047286987304688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0040442305617034435, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.157383918762207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01502846460789442, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.063725471496582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.021988021209836006, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.604588508605957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08107538521289825, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.61510944366455, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06028524413704872, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.87055015563965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0025987299159169197, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.01114273071289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04875287413597107, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.328788757324219, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.053349465131759644, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.24777603149414, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0066513605415821075, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.52505493164062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04380415380001068} +{"step": 1342177280, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3424.930419921875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.008641323074698448, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.243968963623047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0075434609316289425, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.411056518554688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05193326622247696, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.37330150604248, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05013234168291092, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.176656723022461, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11263669282197952, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.881083488464355, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11923355609178543, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.296903610229492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005027621053159237, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.219406127929688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09199301153421402, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.657255172729492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.12241055071353912, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.048612594604492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00561127346009016, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.160741806030273, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.022320246323943138, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.065299987792969, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03297416493296623, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.603364944458008, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10667802393436432, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.614225387573242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0810953751206398, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.88959312438965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0029065085109323263, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.052976608276367, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06507112085819244, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.35754680633545, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.09034331142902374, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.289859771728516, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0085300263017416, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.7686309814453, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.046347759664058685} +{"step": 1363148800, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3423.94287109375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006202848628163338, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.246627807617188, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005375347100198269, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.41672420501709, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03503220155835152, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.379593849182129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03562033921480179, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.17443561553955, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10456008464097977, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.878202438354492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11471565067768097, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.305904388427734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0043088095262646675, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.23149299621582, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08183623850345612, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.662761688232422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09946063160896301, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.050048828125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0040208701975643635, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.164325714111328, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015220793895423412, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.067547798156738, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02187163569033146, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.602150917053223, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08715656399726868, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.613146781921387, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06293493509292603, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.907360076904297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0025365459732711315, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.091978073120117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04613498970866203, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.383490562438965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04977630823850632, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.329378128051758, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.004776066169142723, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.99708557128906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03494959697127342} +{"step": 1384120320, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3423.0048828125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.00700815673917532, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.249286651611328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006160500925034285, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.422157287597656, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04419896751642227, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.3854398727417, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.06727112829685211, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.172539710998535, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10567477345466614, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.875517845153809, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10752657800912857, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.315261840820312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0038992916233837605, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.244131088256836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07699379324913025, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.668540000915527, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10187561064958572, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.051090240478516, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004435471724718809, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.166449546813965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.018681906163692474, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.06871223449707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03509608656167984, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.60091495513916, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08528001606464386, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.612183570861816, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06258226186037064, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.92464256286621, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002472377149388194, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.130380630493164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.045475441962480545, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.40904712677002, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04826957359910011, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.36786651611328, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005792638752609491, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.2128448486328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03756853938102722} +{"step": 1405091840, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3422.115234375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005191043484956026, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.25149154663086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004410609602928162, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.426297187805176, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023404428735375404, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.390363693237305, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02749018743634224, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.170498847961426, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08343446999788284, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.872868537902832, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1009281724691391, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.323877334594727, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0035906631965190172, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.255290985107422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07413017004728317, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.67351245880127, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10528149455785751, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.052383422851562, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004604047629982233, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.169829368591309, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015268483199179173, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.070518493652344, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.024360043928027153, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.599656105041504, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08355136960744858, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.611531257629395, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.062217626720666885, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.940753936767578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0026344931684434414, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.16537857055664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04932146891951561, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.43194580078125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07053253054618835, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.403846740722656, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005378652364015579, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.41221618652344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.038174111396074295} +{"step": 1426063360, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3421.27294921875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.007168137934058905, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.253671646118164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006558457855135202, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.430529594421387, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03256668895483017, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.395303726196289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03289264068007469, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.168424606323242, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1119476854801178, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.870593070983887, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11577250808477402, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.331979751586914, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003976805601269007, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.265491485595703, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07969830930233002, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.678033828735352, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10741647332906723, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.053634643554688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0046170796267688274, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.172657012939453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015575756318867207, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.071869850158691, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023871826007962227, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.598596572875977, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08978551626205444, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.610682487487793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06002838909626007, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.956636428833008, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0027343982364982367, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.198272705078125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04037170112133026, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.453763961791992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04804414138197899, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.438005447387695, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.004779690410941839, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.5999755859375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03466874733567238} +{"step": 1447034880, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3420.47705078125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0045857904478907585, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.255990982055664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004018521402031183, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.43539047241211, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.029106829315423965, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.400599479675293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03222278505563736, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.16641616821289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.06999915093183517, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.868282318115234, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.07469427585601807, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.339632034301758, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0029472510796040297, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.275243759155273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05847550556063652, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.682332038879395, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07568234950304031, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.05474090576172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0029225859325379133, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.175225257873535, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013041293248534203, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.073189735412598, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.019586658105254173, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.59756088256836, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06073623150587082, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.609756469726562, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.045956071466207504, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.972213745117188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0017299855826422572, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.23110580444336, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.034564629197120667, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.47549819946289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0424233078956604, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.4707088470459, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0054311552084982395, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.77401733398438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.036324642598629} +{"step": 1468006400, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3419.726806640625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0054793208837509155, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.25821304321289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004642493091523647, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.439760208129883, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02158850058913231, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.40559196472168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.023282712325453758, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.164517402648926, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08984951674938202, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.865758895874023, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08897648006677628, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.346956253051758, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003514916403219104, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.28435516357422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0664466917514801, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.686474800109863, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09201019257307053, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.055877685546875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004861479625105858, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.177939414978027, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013152259401977062, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.074511528015137, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.019616056233644485, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.596487998962402, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.0792565569281578, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.608858108520508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.050673745572566986, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.987468719482422, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0020438567735254765, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.263240814208984, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.03779151290655136, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.497279167175293, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.048300761729478836, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.501649856567383, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005856600124388933, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.93466186523438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03504595533013344} +{"step": 1488977920, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3419.02197265625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006700099445879459, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.260051727294922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0057947770692408085, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.443024635314941, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02468898892402649, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.409212112426758, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03010287880897522, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.162768363952637, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10378193855285645, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.863489151000977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1040961891412735, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.35447120666504, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004132906440645456, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.293855667114258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08140960335731506, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.69075870513916, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11202050000429153, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.056873321533203, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004936569835990667, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.180776596069336, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.017637334764003754, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.075932502746582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02684006281197071, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.595383644104004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09384343028068542, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.60788631439209, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0654662624001503, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.001880645751953, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0024330823216587305, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.293407440185547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0612332783639431, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.517095565795898, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.08845873922109604, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.53123664855957, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00460976455360651, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.08831787109375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0409766249358654} +{"step": 1509949440, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3418.360107421875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006704994942992926, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.262025833129883, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005590212531387806, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.446767807006836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023232880979776382, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.413565635681152, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02310197986662388, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.161060333251953, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1062401607632637, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.861284255981445, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1079760491847992, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.361495971679688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003939877729862928, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.302671432495117, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08400525897741318, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.694464683532715, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11063055694103241, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.058197021484375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005772160831838846, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.184082984924316, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012736961245536804, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.078004837036133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02023336850106716, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.594613075256348, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09326664358377457, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.607183456420898, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06302933394908905, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.01509666442871, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002493033418431878, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.321027755737305, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04711984470486641, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.53496265411377, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06136924773454666, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.558626174926758, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0041299364529550076, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.23072814941406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03432552516460419} +{"step": 1530920960, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3417.740478515625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005341610871255398, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.263797760009766, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004830148071050644, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.450090408325195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.029703496024012566, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.417470932006836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02700534276664257, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.159435272216797, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07870030403137207, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.859210014343262, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.07761039584875107, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.367815017700195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0030328731518238783, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.310470581054688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05783132091164589, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.69786548614502, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07827406376600266, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.059310913085938, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0036985676269978285, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.186580657958984, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011954938992857933, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.07929515838623, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01670484058558941, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.593782424926758, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07040456682443619, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.606456756591797, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04921115189790726, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.027681350708008, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002503141760826111, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.3474178314209, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.03654402866959572, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.55202579498291, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04808041825890541, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.584110260009766, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005443213973194361, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.36061096191406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03484707698225975} +{"step": 1551892480, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3417.162353515625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005179878324270248, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.265602111816406, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0044598947279155254, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.453373908996582, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.021393688395619392, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.421281814575195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.019780278205871582, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.157936096191406, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07768766582012177, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.85708999633789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.07879400253295898, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.373950958251953, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0030542274471372366, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.318260192871094, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.061752479523420334, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.701133728027344, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08436264097690582, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.060197830200195, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004049370996654034, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.188267707824707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013915996998548508, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.080367088317871, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.020605219528079033, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.5928955078125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07085207104682922, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.605673789978027, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.049672335386276245, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.04004669189453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0024735445622354746, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.37294578552246, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.036162421107292175, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.568534851074219, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04665147140622139, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.60832405090332, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0049085430800914764, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.48243713378906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03406120091676712} +{"step": 1572864000, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3416.625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005164829082787037, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.267412185668945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004266987554728985, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.45673656463623, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.022454602643847466, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.42552661895752, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.022509459406137466, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.156493186950684, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07549723237752914, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.855374336242676, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0796351283788681, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.379985809326172, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0029318449087440968, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.325891494750977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05951167643070221, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.704334259033203, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08093056827783585, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.061256408691406, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003248317865654826, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.190642356872559, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014530373737215996, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.081815719604492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.020038720220327377, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.59220027923584, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06255616247653961, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.605088233947754, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.046527739614248276, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.051651000976562, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0019086907850578427, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.396839141845703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.036052580922842026, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.584236145019531, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04493214562535286, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.631032943725586, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006316479295492172, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.594970703125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03578050434589386} +{"step": 1593835520, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3416.12744140625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006071893032640219, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.268957138061523, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0055617582984268665, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.459427833557129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.038441430777311325, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.428824424743652, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.043351754546165466, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.155071258544922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08532529324293137, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.853669166564941, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08614030480384827, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.385677337646484, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0029833358712494373, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.33270263671875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06593278795480728, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.707342147827148, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08685112744569778, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.062326431274414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003534947521984577, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.19336986541748, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014551769942045212, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.083489418029785, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.021972941234707832, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.591444969177246, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06440426409244537, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.604432106018066, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.046227119863033295, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.062219619750977, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0024051230866461992, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.418441772460938, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0341079942882061, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.598328590393066, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03943120315670967, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.652313232421875, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005536239128559828, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.69833374023438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03261781483888626} +{"step": 1614807040, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3415.667724609375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005760683678090572, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.270395278930664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004762878175824881, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.461902618408203, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.032732121646404266, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.43183708190918, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03630293905735016, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.15383243560791, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07918681204319, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.852099418640137, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08239766210317612, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.390430450439453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002979259705170989, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.338125228881836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06508234888315201, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.709659576416016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09183559566736221, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.063373565673828, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004242504481226206, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.195927619934082, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013879615813493729, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.085204124450684, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02004999667406082, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.590733528137207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07927785813808441, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.60391616821289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.05354803428053856, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.07264518737793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0024411228951066732, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.43964958190918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.03865773603320122, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.612614631652832, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04682391136884689, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.672073364257812, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005071944557130337, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.79302978515625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03456968441605568} +{"step": 1635778560, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3415.244384765625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005938464310020208, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.27164649963379, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0047223325818777084, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.463929176330566, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03287379816174507, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.434252738952637, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03265097737312317, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.152726173400879, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08294667303562164, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.850664138793945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08181091398000717, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.395116806030273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0031805813778191805, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.343734741210938, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05937449634075165, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.712191581726074, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.079133041203022, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.064285278320312, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0038035865873098373, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.198049545288086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012925773859024048, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.086583137512207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.019342906773090363, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.590043067932129, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06813032180070877, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.603388786315918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.046078309416770935, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.08218765258789, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002002487890422344, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.45928192138672, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.033932462334632874, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.6260347366333, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04106909781694412, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.690418243408203, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005785311106592417, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.87925720214844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0338674895465374} +{"step": 1656750080, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3414.8564453125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.005387325771152973, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.272836685180664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004478550050407648, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.466192245483398, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.021052151918411255, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.436756134033203, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.018114609643816948, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.151634216308594, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07443387061357498, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.84922981262207, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.07849429547786713, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.399866104125977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0027058960404247046, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.349485397338867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05572831630706787, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.714681625366211, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07633906602859497, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.06488609313965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0033458697143942118, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.19936466217041, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01400861144065857, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.087372779846191, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0239997711032629, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.589407920837402, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06359367072582245, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.602874755859375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04637015610933304, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.09127426147461, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002057528356090188, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.47804832458496, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.032785359770059586, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.637818336486816, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.042927537113428116, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.70705223083496, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005926025565713644, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.95639038085938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.033767219632864} +{"step": 1677721600, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3414.501708984375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.006362547632306814, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.273929595947266, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005721115041524172, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.468173027038574, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02378145232796669, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.439054489135742, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02626972459256649, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.150623321533203, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09084820747375488, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.847840309143066, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09152720868587494, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.404348373413086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.00338570773601532, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.354917526245117, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06606502085924149, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.717087745666504, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08900290727615356, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.06573486328125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00431368313729763, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.201351165771484, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01448660995811224, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.088716506958008, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02204175479710102, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.588835716247559, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07543475925922394, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.602350234985352, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.05390310287475586, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.099519729614258, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0021576599683612585, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.4954891204834, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.03930941969156265, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.648964881896973, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.046092595905065536, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.722421646118164, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0061258734203875065, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.02671813964844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0340895801782608} +{"step": 1698693120, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3414.179931640625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.003930698148906231, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.274822235107422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003638041904196143, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.469675064086914, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01728181540966034, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.4407958984375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.019279073923826218, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.149639129638672, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.05860235542058945, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.846571922302246, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.06132687255740166, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.408496856689453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0024029251653701067, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.359874725341797, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.04738396406173706, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.719185829162598, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0619145967066288, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.06633186340332, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0022836520802229643, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.202630996704102, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011879628524184227, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.089510917663574, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01619240641593933, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.588330268859863, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.042208924889564514, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.601845741271973, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03386273607611656, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.1070556640625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0017064404673874378, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.511064529418945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.02889321744441986, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.65922737121582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03998761996626854, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.736400604248047, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006048366893082857, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.09056091308594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.030641397461295128} +{"step": 1719664640, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3413.888916015625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0059416829608380795, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.27587890625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005111759062856436, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.471512794494629, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03158215805888176, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.443039894104004, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03494778275489807, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.148763656616211, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08638744056224823, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.845527648925781, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08042891323566437, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.411909103393555, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002840160857886076, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.363767623901367, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.05846341699361801, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.72085189819336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08161065727472305, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.06711196899414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0035909186117351055, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.204202651977539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013997094705700874, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.090627670288086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023512819781899452, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.587922096252441, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07475464791059494, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.6014404296875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.05309059098362923, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.11412239074707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0019710503984242678, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.525487899780273, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.036766763776540756, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.668700218200684, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05253206565976143, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.74913215637207, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005228972528129816, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.14784240722656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.032744843512773514} +{"step": 1740636160, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3413.6279296875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.004546570125967264, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.27669334411621, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0037983397487550974, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.472822189331055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.017782751470804214, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.44474983215332, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.01990295760333538, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.147834777832031, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.06752770394086838, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.844493865966797, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.07427463680505753, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.41545295715332, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0028543106745928526, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.367929458618164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0568307600915432, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.722661018371582, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07532751560211182, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.06774139404297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0031099796760827303, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.20566177368164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012401482090353966, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.09164810180664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.020476700738072395, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.587447166442871, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.0563468262553215, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.601059913635254, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.041712112724781036, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.12043571472168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002194578293710947, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.538549423217773, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.032863013446331024, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.677268028259277, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.043788839131593704, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.760534286499023, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005691679660230875, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.19859313964844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03115684725344181} +{"step": 1761607680, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3413.395263671875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.004461491946130991, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.27754783630371, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004026042763143778, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.474278450012207, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.020404918119311333, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.4464693069458, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.020257657393813133, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.147032737731934, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.06339200586080551, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.843542098999023, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.06088220328092575, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.418466567993164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002337242243811488, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.371431350708008, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.04483336582779884, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.724263191223145, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06179923936724663, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.068317413330078, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002222386421635747, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.206860542297363, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010596404783427715, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.092386245727539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.014600816182792187, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.587038040161133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.045123904943466187, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.600692749023438, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03711530938744545, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.126379013061523, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0014734361320734024, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.55061149597168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0301237553358078, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.685029029846191, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04090886190533638, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.770700454711914, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005970557685941458, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.24342346191406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.034137528389692307} +{"step": 1782579200, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3413.189453125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.004183728713542223, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.278242111206055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0032885773107409477, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.475320816040039, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.026327701285481453, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.44770336151123, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03336409851908684, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.146395683288574, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.05550439655780792, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.842756271362305, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.05574306100606918, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.421092987060547, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0019009546376764774, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.374496459960938, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.04298752546310425, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.725543975830078, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.05840081721544266, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.068849563598633, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002146870130673051, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.20786190032959, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010786266066133976, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.093013763427734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.015698937699198723, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.586679458618164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.04533187299966812, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.600358963012695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0350378081202507, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.13176155090332, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0017046888824552298, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.561466217041016, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.027811449021100998, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.691972732543945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03586333245038986, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.779766082763672, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005533706862479448, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.28285217285156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.029739784076809883} +{"step": 1803550720, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3413.00830078125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.004748993553221226, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.278902053833008, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00431397370994091, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.476434707641602, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.017932087182998657, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.449048042297363, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.018153773620724678, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.145761489868164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.06799861043691635, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.842047691345215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.06578406691551208, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.423555374145508, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0028232871554791927, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.377277374267578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0483730286359787, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.726768493652344, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06884529441595078, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.06923484802246, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0029584732837975025, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.208534240722656, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012336024083197117, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.093428611755371, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01973726786673069, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.586335182189941, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05313770845532417, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.600006103515625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04090322554111481, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.136518478393555, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00183152558747679, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.570945739746094, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.030713895335793495, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.698098182678223, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03953462094068527, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.78766632080078, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006118585355579853, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.3171844482422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.031109068542718887} +{"step": 1824522240, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.85107421875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.004104124382138252, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.279420852661133, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0036377902142703533, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.477237701416016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.021200550720095634, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.450021743774414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.021466625854372978, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.145201683044434, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.0589696429669857, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.841405868530273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.06131362542510033, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.425750732421875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0022637636866420507, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.379806518554688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.04745417460799217, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.727906227111816, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06338111311197281, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.06963348388672, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00246245670132339, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.209322929382324, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012222180142998695, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.093952178955078, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01686403714120388, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.586060523986816, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.046582531183958054, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.599742889404297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03610854223370552, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.140697479248047, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0018763572443276644, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.57929801940918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.028184784576296806, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.70369815826416, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03530401363968849, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.7945499420166, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005990584380924702, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.34671020507812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.031121667474508286} +{"step": 1845493760, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.716064453125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.004191262181848288, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.279869079589844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0037160080391913652, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.477880477905273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.024711498990654945, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.45083999633789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02230069600045681, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.144718170166016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.056218221783638, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.840858459472656, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0573573112487793, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.42792510986328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0021813842467963696, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.382469177246094, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.044865112751722336, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.729080200195312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.060073234140872955, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.070070266723633, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0025349806528538465, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.210180282592773, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010778363794088364, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.094572067260742, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01583588682115078, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.585827827453613, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.04404221847653389, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.599539756774902, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03486741706728935, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.144325256347656, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0016140751540660858, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.5864200592041, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.026518141850829124, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.708406448364258, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.036101553589105606, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.800535202026367, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005103154573589563, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.3717041015625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02921086736023426} +{"step": 1866465280, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.601318359375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0038964804261922836, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.280370712280273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0035398101899772882, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.478619575500488, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023115331307053566, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.451750755310059, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.022746652364730835, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.144301414489746, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.05639879032969475, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.840370178222656, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.057863105088472366, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.4298038482666, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002138282638043165, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.384689331054688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.044971268624067307, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.7300443649292, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.061004433780908585, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.0704288482666, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002272816142067313, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.210918426513672, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011196526698768139, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.095109939575195, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.015387061052024364, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.585624694824219, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.04614809900522232, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.599396705627441, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03539079800248146, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.147512435913086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0013042233185842633, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.592788696289062, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.02731335535645485, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.712468147277832, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03559965267777443, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.805625915527344, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.004540056921541691, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.39291381835938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.030384313315153122} +{"step": 1887436800, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.505126953125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0042291684076189995, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.280826568603516, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0036539731081575155, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.479334831237793, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.025977639481425285, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.452662467956543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.023856662213802338, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.143959045410156, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.06314080208539963, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.839943885803223, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.060631949454545975, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.431320190429688, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0025623412802815437, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.38640022277832, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0462801456451416, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.730775833129883, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06142814829945564, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.070676803588867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0024268338456749916, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.211328506469727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011275392025709152, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.095389366149902, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.016260085627436638, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.585482597351074, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.052142590284347534, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.599249839782715, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03816364333033562, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.150300979614258, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0016457195160910487, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.598310470581055, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.027486266568303108, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.715933799743652, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.034449171274900436, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.80988121032715, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006071754731237888, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.41087341308594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.029376041144132614} +{"step": 1908408320, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.426513671875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0039534009993076324, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.28116798400879, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003444765228778124, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.479823112487793, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.020688556134700775, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.453299522399902, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.01991122215986252, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.143707275390625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.0564630962908268, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.839616775512695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.05630066618323326, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.43253517150879, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0020966497249901295, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.3878116607666, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.040356121957302094, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.731374740600586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.05532617121934891, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.07086944580078, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0020271367393434048, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.21160888671875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011029292829334736, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.09557056427002, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.014621354639530182, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.585342407226562, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.04186227172613144, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.59912109375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.031987737864255905, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.15250587463379, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0017605805769562721, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.602779388427734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.025283843278884888, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.718700408935547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.032562486827373505, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.813398361206055, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005621515680104494, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.42578125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.029253244400024414} +{"step": 1929379840, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.36328125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0041549294255673885, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.28142738342285, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003685971489176154, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.480189323425293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.018008936196565628, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.45374870300293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.018520483747124672, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.143476486206055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.06011516973376274, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.839330673217773, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.06297490000724792, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.433494567871094, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0029174559749662876, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.388935089111328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.04813358187675476, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.731871604919434, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.060995399951934814, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.07112693786621, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002504906617105007, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.212067604064941, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011714830063283443, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.095909118652344, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01565367914736271, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.585219383239746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05052110552787781, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.599020957946777, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03768495097756386, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.15437889099121, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0016397652216255665, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.606605529785156, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.02904745377600193, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.72110366821289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03888113796710968, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.816207885742188, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0054352907463908195, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.43753051757812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0302269384264946} +{"step": 1950351360, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.31396484375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0036077527329325676, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.281673431396484, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00322232604958117, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.480551719665527, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.019766047596931458, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.454187393188477, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.021783921867609024, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.143294334411621, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.05358516424894333, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.839105606079102, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.05124182999134064, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.43433952331543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0020879011135548353, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.389942169189453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.03939773887395859, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.73233413696289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0544944629073143, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.07134246826172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002323621418327093, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.212507247924805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01057353150099516, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.096237182617188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.014971323311328888, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.585119247436523, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.043048374354839325, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.5989351272583, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03385341912508011, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.155832290649414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0015235766768455505, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.609493255615234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.026346592232584953, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.723018646240234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03613840416073799, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.81840705871582, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005470685660839081, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.44642639160156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.030020082369446754} +{"step": 1971322880, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.276611328125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0037478692829608917, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.281848907470703, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0033488129265606403, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.48081111907959, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.019247539341449738, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.454526901245117, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.020933330059051514, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.143135070800781, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.051057908684015274, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.838932037353516, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.05039920285344124, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.434951782226562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0021179006434977055, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.39067268371582, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.03954504430294037, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.73266887664795, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.053038131445646286, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.071483612060547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0018249565036967397, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.212753295898438, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009769515134394169, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.096421241760254, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.012485025450587273, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.585042953491211, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.037027206271886826, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.598855018615723, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03134985268115997, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.156938552856445, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0016816622810438275, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.61174201965332, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.02515820600092411, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.724452018737793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03388086333870888, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.8200626373291, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006257243920117617, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.45314025878906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03058544546365738} +{"step": 1992294400, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.249755859375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0034001742023974657, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.28196144104004, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0028156740590929985, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.480971336364746, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.017100678756833076, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.45472526550293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.019704949110746384, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.143017768859863, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.04887061566114426, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.838796615600586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.050374552607536316, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.4354190826416, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0019675372168421745, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.39122200012207, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.03986097499728203, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.732916831970215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.052176594734191895, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.071603775024414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0018350724130868912, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.212969779968262, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009943603537976742, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.096577644348145, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.012941977940499783, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.584986686706543, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.03767900541424751, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.598803520202637, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.030608074739575386, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.157732009887695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0014803953235968947, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.613370895385742, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.024807285517454147, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.725536346435547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03179767727851868, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.821260452270508, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005156475119292736, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.4580078125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.029250318184494972} +{"step": 2013265920, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.2314453125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0035848424304276705, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.282033920288086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0030947839841246605, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.481060981750488, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.017643803730607033, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.454834938049316, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.016183413565158844, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.142938613891602, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.04879993945360184, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.838706970214844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.04769742116332054, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.435710906982422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.001912868581712246, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.391565322875977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.03851291537284851, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.73306655883789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0510735884308815, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.071670532226562, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0018335632048547268, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.213075637817383, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009935041889548302, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.09665298461914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.013325698673725128, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.584948539733887, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.03528103232383728, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.598773002624512, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.028452735394239426, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.15827751159668, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0018610020633786917, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.61448097229004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.023679986596107483, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.726243019104004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.030731631442904472, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.822065353393555, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005035300739109516, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.4612579345703, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.029489951208233833} +{"step": 2034237440, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.220458984375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.003706904361024499, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.282079696655273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003202311461791396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.48112964630127, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01666676066815853, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.45490837097168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.015647640451788902, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.142890930175781, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.05225234478712082, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.838651657104492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.05247636139392853, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.435901641845703, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002000955631956458, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.391788482666016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0397830568253994, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.733170509338379, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.05496500805020332, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.07170295715332, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0020502866245806217, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.213120460510254, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010888485237956047, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.096685409545898, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01347147673368454, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.584925651550293, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.0372220017015934, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.598756790161133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.029674740508198738, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.158626556396484, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0015353038907051086, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.615184783935547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.024881919845938683, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.726690292358398, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03374282270669937, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.822559356689453, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005367047619074583, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.46324157714844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.028950532898306847} +{"step": 2055208960, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 3412.214599609375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0031389608047902584, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.2821102142334, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002582780783995986, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 11.481178283691406, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.015648413449525833, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 11.454968452453613, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.015018824487924576, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 10.142865180969238, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.046812936663627625, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 9.838624000549316, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.04686308279633522, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 17.435997009277344, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0020293905399739742, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 28.391895294189453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.03861294686794281, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 13.733219146728516, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0510728545486927, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.071725845336914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0018613964784890413, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.213157653808594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.0104771563783288, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 10.09671401977539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.012377182953059673, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 9.584914207458496, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.03487376123666763, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 9.598746299743652, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.028254525735974312, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 17.1588077545166, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0015385085716843605, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 26.615550994873047, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.02326105162501335, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 12.726926803588867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.030681820586323738, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.82281494140625, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005587248597294092, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.4642791748047, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.029090994969010353} diff --git a/metrics/jsonlines/throughput.jsonl b/metrics/jsonlines/throughput.jsonl index 515c4a5f9a4f5652bf20cf35b1d47fadf3fa3d3e..d9aba26334b90ddbd71242504aa14909ad89a41c 100644 --- a/metrics/jsonlines/throughput.jsonl +++ b/metrics/jsonlines/throughput.jsonl @@ -1,98 +1,98 @@ -{"step": 20971520, "throughput/token_count": 20971520, "throughput/batch_count": 10, "throughput/flop_count": 0, "throughput/total_time": 72.8780845789588, "throughput/update_time": 72.70853926707059, "throughput/token_count_per_second_total_recent": 303184.52690817736, "throughput/token_count_per_second_total_cum": 287761.6792642057, "throughput/token_count_per_second_update_recent": 303769.9932048806, "throughput/token_count_per_second_update_cum": 288432.69595842256, "throughput/batch_count_per_second_total_recent": 0.14456964822205418, "throughput/batch_count_per_second_total_cum": 0.13721546138010296, "throughput/batch_count_per_second_update_recent": 0.14484882030719787, "throughput/batch_count_per_second_update_cum": 0.1375354270736802, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 41943040, "throughput/token_count": 41943040, "throughput/batch_count": 20, "throughput/flop_count": 0, "throughput/total_time": 141.98572698398493, "throughput/update_time": 141.70471469813492, "throughput/token_count_per_second_total_recent": 303330.324193843, "throughput/token_count_per_second_total_cum": 295403.21334362647, "throughput/token_count_per_second_update_recent": 303865.7170326681, "throughput/token_count_per_second_update_cum": 295989.0225907356, "throughput/batch_count_per_second_total_recent": 0.1446391697854247, "throughput/batch_count_per_second_total_cum": 0.1408592287748463, "throughput/batch_count_per_second_update_recent": 0.14489446498521238, "throughput/batch_count_per_second_update_cum": 0.14113856439148692, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 62914560, "throughput/token_count": 62914560, "throughput/batch_count": 30, "throughput/flop_count": 0, "throughput/total_time": 301.07908088195836, "throughput/update_time": 210.67360014707083, "throughput/token_count_per_second_total_recent": 209386.8760613492, "throughput/token_count_per_second_total_cum": 208963.57135043334, "throughput/token_count_per_second_update_recent": 303936.88272447564, "throughput/token_count_per_second_update_cum": 298635.23458126443, "throughput/batch_count_per_second_total_recent": 0.0998434429461237, "throughput/batch_count_per_second_total_cum": 0.09964159553071658, "throughput/batch_count_per_second_update_recent": 0.14492839943145544, "throughput/batch_count_per_second_update_cum": 0.14240037659705373, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 83886080, "throughput/token_count": 83886080, "throughput/batch_count": 40, "throughput/flop_count": 0, "throughput/total_time": 370.1664184979745, "throughput/update_time": 279.6480940769543, "throughput/token_count_per_second_total_recent": 227480.83177567745, "throughput/token_count_per_second_total_cum": 226617.20731011964, "throughput/token_count_per_second_update_recent": 303965.2293772007, "throughput/token_count_per_second_update_cum": 299970.1473985945, "throughput/batch_count_per_second_total_recent": 0.10847131336959717, "throughput/batch_count_per_second_total_cum": 0.1080595051336859, "throughput/batch_count_per_second_update_recent": 0.14494191616878543, "throughput/batch_count_per_second_update_cum": 0.14303691263131832, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 104857600, "throughput/token_count": 104857600, "throughput/batch_count": 50, "throughput/flop_count": 0, "throughput/total_time": 529.2283039629692, "throughput/update_time": 348.5810220290441, "throughput/token_count_per_second_total_recent": 198148.21678820765, "throughput/token_count_per_second_total_cum": 198133.01596835422, "throughput/token_count_per_second_update_recent": 304019.39034162066, "throughput/token_count_per_second_update_cum": 300812.70457478653, "throughput/batch_count_per_second_total_recent": 0.09448443259630568, "throughput/batch_count_per_second_total_cum": 0.09447718428056441, "throughput/batch_count_per_second_update_recent": 0.14496774212914498, "throughput/batch_count_per_second_update_cum": 0.1434386752008374, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 125829120, "throughput/token_count": 125829120, "throughput/batch_count": 60, "throughput/flop_count": 0, "throughput/total_time": 598.2814449759899, "throughput/update_time": 417.52457091695396, "throughput/token_count_per_second_total_recent": 210551.30585254813, "throughput/token_count_per_second_total_cum": 210317.60395820023, "throughput/token_count_per_second_update_recent": 304047.2667521663, "throughput/token_count_per_second_update_cum": 301369.3774324662, "throughput/batch_count_per_second_total_recent": 0.10039868633868605, "throughput/batch_count_per_second_total_cum": 0.10028724859151851, "throughput/batch_count_per_second_update_recent": 0.14498103463753048, "throughput/batch_count_per_second_update_cum": 0.14370411750434217, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 146800640, "throughput/token_count": 146800640, "throughput/batch_count": 70, "throughput/flop_count": 0, "throughput/total_time": 757.2756010189769, "throughput/update_time": 486.45154978276696, "throughput/token_count_per_second_total_recent": 193803.31645371226, "throughput/token_count_per_second_total_cum": 193853.65090657564, "throughput/token_count_per_second_update_recent": 304077.6537591534, "throughput/token_count_per_second_update_cum": 301778.54313663155, "throughput/batch_count_per_second_total_recent": 0.0924126226681291, "throughput/batch_count_per_second_total_cum": 0.09243662400559217, "throughput/batch_count_per_second_update_recent": 0.1449955242915885, "throughput/batch_count_per_second_update_cum": 0.14389922291595056, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 167772160, "throughput/token_count": 167772160, "throughput/batch_count": 80, "throughput/flop_count": 0, "throughput/total_time": 826.3311064429581, "throughput/update_time": 555.3910710238852, "throughput/token_count_per_second_total_recent": 203106.08856174554, "throughput/token_count_per_second_total_cum": 203032.60846877168, "throughput/token_count_per_second_update_recent": 304093.3510318854, "throughput/token_count_per_second_update_cum": 302079.32527742203, "throughput/batch_count_per_second_total_recent": 0.09684853008353497, "throughput/batch_count_per_second_total_cum": 0.09681349204481682, "throughput/batch_count_per_second_update_recent": 0.14500300933450957, "throughput/batch_count_per_second_update_cum": 0.14404264701720335, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 188743680, "throughput/token_count": 188743680, "throughput/batch_count": 90, "throughput/flop_count": 0, "throughput/total_time": 985.3252943049883, "throughput/update_time": 624.3266040377785, "throughput/token_count_per_second_total_recent": 191491.07196848473, "throughput/token_count_per_second_total_cum": 191554.68868089168, "throughput/token_count_per_second_update_recent": 304107.4980542893, "throughput/token_count_per_second_update_cum": 302315.6129809567, "throughput/batch_count_per_second_total_recent": 0.09131005857872235, "throughput/batch_count_per_second_total_cum": 0.0913403933910807, "throughput/batch_count_per_second_update_recent": 0.14500975516046968, "throughput/batch_count_per_second_update_cum": 0.14415531777427515, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 209715200, "throughput/token_count": 209715200, "throughput/batch_count": 100, "throughput/flop_count": 0, "throughput/total_time": 1054.376383124967, "throughput/update_time": 693.2531012066174, "throughput/token_count_per_second_total_recent": 198915.10829269554, "throughput/token_count_per_second_total_cum": 198899.7509394556, "throughput/token_count_per_second_update_recent": 304122.8133150741, "throughput/token_count_per_second_update_cum": 302508.8523008228, "throughput/batch_count_per_second_total_recent": 0.0948501149619558, "throughput/batch_count_per_second_total_cum": 0.09484279200527936, "throughput/batch_count_per_second_update_recent": 0.14501705804589943, "throughput/batch_count_per_second_update_cum": 0.1442474614624132, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 230686720, "throughput/token_count": 230686720, "throughput/batch_count": 110, "throughput/flop_count": 0, "throughput/total_time": 1213.9692550019827, "throughput/update_time": 762.1767162576434, "throughput/token_count_per_second_total_recent": 183057.85747411964, "throughput/token_count_per_second_total_cum": 190026.82238408356, "throughput/token_count_per_second_update_recent": 304174.63426764513, "throughput/token_count_per_second_update_cum": 302668.2855554715, "throughput/batch_count_per_second_total_recent": 0.08728878854471189, "throughput/batch_count_per_second_total_cum": 0.09061184996799639, "throughput/batch_count_per_second_update_recent": 0.1450417682016588, "throughput/batch_count_per_second_update_cum": 0.14432348516248297, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 251658240, "throughput/token_count": 251658240, "throughput/batch_count": 120, "throughput/flop_count": 0, "throughput/total_time": 1283.0319452389958, "throughput/update_time": 831.1140817146515, "throughput/token_count_per_second_total_recent": 198845.76885098737, "throughput/token_count_per_second_total_cum": 196143.39372752138, "throughput/token_count_per_second_update_recent": 304198.81107237784, "throughput/token_count_per_second_update_cum": 302796.26532233687, "throughput/batch_count_per_second_total_recent": 0.09481705133962029, "throughput/batch_count_per_second_total_cum": 0.09352845846534795, "throughput/batch_count_per_second_update_recent": 0.14505329660052196, "throughput/batch_count_per_second_update_cum": 0.14438451067082256, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 272629760, "throughput/token_count": 272629760, "throughput/batch_count": 130, "throughput/flop_count": 0, "throughput/total_time": 1442.136592313007, "throughput/update_time": 900.0497305926983, "throughput/token_count_per_second_total_recent": 183061.82869602673, "throughput/token_count_per_second_total_cum": 189045.72663449022, "throughput/token_count_per_second_update_recent": 304212.1371891679, "throughput/token_count_per_second_update_cum": 302905.2181599661, "throughput/batch_count_per_second_total_recent": 0.08729068217088067, "throughput/batch_count_per_second_total_cum": 0.09014402705883513, "throughput/batch_count_per_second_update_recent": 0.14505965098818202, "throughput/batch_count_per_second_update_cum": 0.14443646343229585, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 293601280, "throughput/token_count": 293601280, "throughput/batch_count": 140, "throughput/flop_count": 0, "throughput/total_time": 1511.2121625279542, "throughput/update_time": 969.0007961746887, "throughput/token_count_per_second_total_recent": 198846.6050728056, "throughput/token_count_per_second_total_cum": 194281.97263107257, "throughput/token_count_per_second_update_recent": 304221.46967557416, "throughput/token_count_per_second_update_cum": 302993.84805362986, "throughput/batch_count_per_second_total_recent": 0.09481745008125572, "throughput/batch_count_per_second_total_cum": 0.09264086371949795, "throughput/batch_count_per_second_update_recent": 0.14506410106447895, "throughput/batch_count_per_second_update_cum": 0.14447872545892232, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 314572800, "throughput/token_count": 314572800, "throughput/batch_count": 150, "throughput/flop_count": 0, "throughput/total_time": 1670.3140526569914, "throughput/update_time": 1037.9416665838216, "throughput/token_count_per_second_total_recent": 183057.33289483035, "throughput/token_count_per_second_total_cum": 188331.52933102896, "throughput/token_count_per_second_update_recent": 304217.9627687801, "throughput/token_count_per_second_update_cum": 303073.679502003, "throughput/batch_count_per_second_total_recent": 0.08728853840581434, "throughput/batch_count_per_second_total_cum": 0.08980347124625633, "throughput/batch_count_per_second_update_recent": 0.1450624288410092, "throughput/batch_count_per_second_update_cum": 0.1445167920598998, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 335544320, "throughput/token_count": 335544320, "throughput/batch_count": 160, "throughput/flop_count": 0, "throughput/total_time": 1739.393865599006, "throughput/update_time": 1106.8950568859, "throughput/token_count_per_second_total_recent": 198822.15650730117, "throughput/token_count_per_second_total_cum": 192908.7635849782, "throughput/token_count_per_second_update_recent": 304213.5003406275, "throughput/token_count_per_second_update_cum": 303140.13773266703, "throughput/batch_count_per_second_total_recent": 0.09480579209675845, "throughput/batch_count_per_second_total_cum": 0.0919860666203395, "throughput/batch_count_per_second_update_recent": 0.14506030098945022, "throughput/batch_count_per_second_update_cum": 0.14454848181374885, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 356515840, "throughput/token_count": 356515840, "throughput/batch_count": 170, "throughput/flop_count": 0, "throughput/total_time": 1898.5316135869944, "throughput/update_time": 1175.8391638009343, "throughput/token_count_per_second_total_recent": 183028.98999619004, "throughput/token_count_per_second_total_cum": 187785.04263429996, "throughput/token_count_per_second_update_recent": 304206.56140502315, "throughput/token_count_per_second_update_cum": 303201.19534677867, "throughput/batch_count_per_second_total_recent": 0.08727502345857145, "throughput/batch_count_per_second_total_cum": 0.08954288608279226, "throughput/batch_count_per_second_update_recent": 0.14505699224711568, "throughput/batch_count_per_second_update_cum": 0.1445775963529485, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 377487360, "throughput/token_count": 377487360, "throughput/batch_count": 180, "throughput/flop_count": 0, "throughput/total_time": 1967.6012063919916, "throughput/update_time": 1244.7901415458764, "throughput/token_count_per_second_total_recent": 198786.5287437099, "throughput/token_count_per_second_total_cum": 191851.559540463, "throughput/token_count_per_second_update_recent": 304202.3660841232, "throughput/token_count_per_second_update_cum": 303253.81556380825, "throughput/batch_count_per_second_total_recent": 0.09478880345521445, "throughput/batch_count_per_second_total_cum": 0.09148195244811201, "throughput/batch_count_per_second_update_recent": 0.14505499176222, "throughput/batch_count_per_second_update_cum": 0.14460268762770093, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 398458880, "throughput/token_count": 398458880, "throughput/batch_count": 190, "throughput/flop_count": 0, "throughput/total_time": 2127.1002123509534, "throughput/update_time": 1313.7616025957977, "throughput/token_count_per_second_total_recent": 182945.27217703668, "throughput/token_count_per_second_total_cum": 187324.9213583632, "throughput/token_count_per_second_update_recent": 304183.1123605812, "throughput/token_count_per_second_update_cum": 303296.18342681386, "throughput/batch_count_per_second_total_recent": 0.0872351036915954, "throughput/batch_count_per_second_total_cum": 0.08932348316114579, "throughput/batch_count_per_second_update_recent": 0.1450458108714014, "throughput/batch_count_per_second_update_cum": 0.14462289019909566, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 419430400, "throughput/token_count": 419430400, "throughput/batch_count": 200, "throughput/flop_count": 0, "throughput/total_time": 2196.1893687059637, "throughput/update_time": 1382.727176492801, "throughput/token_count_per_second_total_recent": 198802.04944741927, "throughput/token_count_per_second_total_cum": 190980.98095572527, "throughput/token_count_per_second_update_recent": 304166.6142106099, "throughput/token_count_per_second_update_cum": 303335.6161147121, "throughput/batch_count_per_second_total_recent": 0.09479620430346454, "throughput/batch_count_per_second_total_cum": 0.09106682822977318, "throughput/batch_count_per_second_update_recent": 0.1450379439404535, "throughput/batch_count_per_second_update_cum": 0.14464169316993336, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 440401920, "throughput/token_count": 440401920, "throughput/batch_count": 210, "throughput/flop_count": 0, "throughput/total_time": 2356.148066883965, "throughput/update_time": 1451.6794419176877, "throughput/token_count_per_second_total_recent": 182881.337945766, "throughput/token_count_per_second_total_cum": 186916.06278481343, "throughput/token_count_per_second_update_recent": 304155.44790793996, "throughput/token_count_per_second_update_cum": 303374.08334323677, "throughput/batch_count_per_second_total_recent": 0.08720461747444438, "throughput/batch_count_per_second_total_cum": 0.08912852420082733, "throughput/batch_count_per_second_update_recent": 0.1450326194324207, "throughput/batch_count_per_second_update_cum": 0.14466003577386702, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 461373440, "throughput/token_count": 461373440, "throughput/batch_count": 220, "throughput/flop_count": 0, "throughput/total_time": 2425.217230288952, "throughput/update_time": 1520.627368493646, "throughput/token_count_per_second_total_recent": 198635.85056831336, "throughput/token_count_per_second_total_cum": 190240.04705138505, "throughput/token_count_per_second_update_recent": 304150.01829736284, "throughput/token_count_per_second_update_cum": 303409.92774386454, "throughput/batch_count_per_second_total_recent": 0.09471695450225513, "throughput/batch_count_per_second_total_cum": 0.09071352341241123, "throughput/batch_count_per_second_update_recent": 0.14503003039234297, "throughput/batch_count_per_second_update_cum": 0.14467712771599986, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 482344960, "throughput/token_count": 482344960, "throughput/batch_count": 230, "throughput/flop_count": 0, "throughput/total_time": 2584.4522528109956, "throughput/update_time": 1589.588658401568, "throughput/token_count_per_second_total_recent": 182858.9462582528, "throughput/token_count_per_second_total_cum": 186633.34154282577, "throughput/token_count_per_second_update_recent": 304138.98511156306, "throughput/token_count_per_second_update_cum": 303440.1116607289, "throughput/batch_count_per_second_total_recent": 0.08719394028580323, "throughput/batch_count_per_second_total_cum": 0.08899371220723427, "throughput/batch_count_per_second_update_recent": 0.14502476935938027, "throughput/batch_count_per_second_update_cum": 0.1446915205291409, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 503316480, "throughput/token_count": 503316480, "throughput/batch_count": 240, "throughput/flop_count": 0, "throughput/total_time": 2653.526103938988, "throughput/update_time": 1658.537395758729, "throughput/token_count_per_second_total_recent": 198610.43917825838, "throughput/token_count_per_second_total_cum": 189678.36014609362, "throughput/token_count_per_second_update_recent": 304140.2175378249, "throughput/token_count_per_second_update_cum": 303470.08230691624, "throughput/batch_count_per_second_total_recent": 0.09470483740723533, "throughput/batch_count_per_second_total_cum": 0.09044569022469216, "throughput/batch_count_per_second_update_recent": 0.14502535702601665, "throughput/batch_count_per_second_update_cum": 0.14470581164689839, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 524288000, "throughput/token_count": 524288000, "throughput/batch_count": 250, "throughput/flop_count": 0, "throughput/total_time": 2812.5629005369847, "throughput/update_time": 1727.488953433698, "throughput/token_count_per_second_total_recent": 182870.16172341918, "throughput/token_count_per_second_total_cum": 186409.34213414427, "throughput/token_count_per_second_update_recent": 304135.550068268, "throughput/token_count_per_second_update_cum": 303497.1650370802, "throughput/batch_count_per_second_total_recent": 0.08719928823634109, "throughput/batch_count_per_second_total_cum": 0.08888690096575941, "throughput/batch_count_per_second_update_recent": 0.14502313140309717, "throughput/batch_count_per_second_update_cum": 0.14471872569898614, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 545259520, "throughput/token_count": 545259520, "throughput/batch_count": 260, "throughput/flop_count": 0, "throughput/total_time": 2881.6501679039793, "throughput/update_time": 1796.4439407095779, "throughput/token_count_per_second_total_recent": 198626.0630295748, "throughput/token_count_per_second_total_cum": 189217.80515662124, "throughput/token_count_per_second_update_recent": 304134.7160661683, "throughput/token_count_per_second_update_cum": 303521.58931529353, "throughput/batch_count_per_second_total_recent": 0.09471228744009724, "throughput/batch_count_per_second_total_cum": 0.09022608049231588, "throughput/batch_count_per_second_update_recent": 0.14502273371990598, "throughput/batch_count_per_second_update_cum": 0.14473037210240056, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 566231040, "throughput/token_count": 566231040, "throughput/batch_count": 270, "throughput/flop_count": 0, "throughput/total_time": 3040.8488886230043, "throughput/update_time": 1865.3935032716836, "throughput/token_count_per_second_total_recent": 182858.70351484453, "throughput/token_count_per_second_total_cum": 186208.21380453664, "throughput/token_count_per_second_update_recent": 304132.34518659924, "throughput/token_count_per_second_update_cum": 303545.09062398714, "throughput/batch_count_per_second_total_recent": 0.08719382453672625, "throughput/batch_count_per_second_total_cum": 0.08879099550463516, "throughput/batch_count_per_second_update_recent": 0.14502160319642984, "throughput/batch_count_per_second_update_cum": 0.14474157839965207, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 587202560, "throughput/token_count": 587202560, "throughput/batch_count": 280, "throughput/flop_count": 0, "throughput/total_time": 3109.932256244996, "throughput/update_time": 1934.345054808713, "throughput/token_count_per_second_total_recent": 198674.55591326, "throughput/token_count_per_second_total_cum": 188815.22541876906, "throughput/token_count_per_second_update_recent": 304132.7124115641, "throughput/token_count_per_second_update_cum": 303566.6043864487, "throughput/batch_count_per_second_total_recent": 0.09473541064894676, "throughput/batch_count_per_second_total_cum": 0.09003411551416829, "throughput/batch_count_per_second_update_recent": 0.1450217783029385, "throughput/batch_count_per_second_update_cum": 0.1447518369610065, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 608174080, "throughput/token_count": 608174080, "throughput/batch_count": 290, "throughput/flop_count": 0, "throughput/total_time": 3268.990683123993, "throughput/update_time": 2003.3044974627555, "throughput/token_count_per_second_total_recent": 182928.15125267982, "throughput/token_count_per_second_total_cum": 186043.3812612772, "throughput/token_count_per_second_update_recent": 304138.67168085004, "throughput/token_count_per_second_update_cum": 303585.44133968174, "throughput/batch_count_per_second_total_recent": 0.08722693979867926, "throughput/batch_count_per_second_total_cum": 0.08871239722312793, "throughput/batch_count_per_second_update_recent": 0.14502461990396978, "throughput/batch_count_per_second_update_cum": 0.14476081912025535, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 629145600, "throughput/token_count": 629145600, "throughput/batch_count": 300, "throughput/flop_count": 0, "throughput/total_time": 3338.075166533992, "throughput/update_time": 2072.2588375147316, "throughput/token_count_per_second_total_recent": 198852.5024293055, "throughput/token_count_per_second_total_cum": 188475.56409380015, "throughput/token_count_per_second_update_recent": 304141.607241635, "throughput/token_count_per_second_update_cum": 303603.772178642, "throughput/batch_count_per_second_total_recent": 0.09482026215997004, "throughput/batch_count_per_second_total_cum": 0.0898721523732186, "throughput/batch_count_per_second_update_recent": 0.1450260196884322, "throughput/batch_count_per_second_update_cum": 0.14476955994541263, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 650117120, "throughput/token_count": 650117120, "throughput/batch_count": 310, "throughput/flop_count": 0, "throughput/total_time": 3497.58566847397, "throughput/update_time": 2141.20914719766, "throughput/token_count_per_second_total_recent": 183001.3686918585, "throughput/token_count_per_second_total_cum": 185875.96748806792, "throughput/token_count_per_second_update_recent": 304143.5676381246, "throughput/token_count_per_second_update_cum": 303621.4938885585, "throughput/batch_count_per_second_total_recent": 0.08726185259430813, "throughput/batch_count_per_second_total_cum": 0.08863256811526676, "throughput/batch_count_per_second_update_recent": 0.14502695447832326, "throughput/batch_count_per_second_update_cum": 0.14477801031520773, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 671088640, "throughput/token_count": 671088640, "throughput/batch_count": 320, "throughput/flop_count": 0, "throughput/total_time": 3566.6879564279807, "throughput/update_time": 2210.160061070579, "throughput/token_count_per_second_total_recent": 198791.12233613819, "throughput/token_count_per_second_total_cum": 188154.57034601137, "throughput/token_count_per_second_update_recent": 304141.88347302657, "throughput/token_count_per_second_update_cum": 303638.02686531737, "throughput/batch_count_per_second_total_recent": 0.0947909938507739, "throughput/batch_count_per_second_total_cum": 0.08971909062672204, "throughput/batch_count_per_second_update_recent": 0.14502615140582398, "throughput/batch_count_per_second_update_cum": 0.14478589385286206, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 692060160, "throughput/token_count": 692060160, "throughput/batch_count": 330, "throughput/flop_count": 0, "throughput/total_time": 3725.7928462160053, "throughput/update_time": 2279.117056379677, "throughput/token_count_per_second_total_recent": 183016.1621198946, "throughput/token_count_per_second_total_cum": 185748.42686245186, "throughput/token_count_per_second_update_recent": 304143.5386557871, "throughput/token_count_per_second_update_cum": 303652.7492358471, "throughput/batch_count_per_second_total_recent": 0.08726890665049296, "throughput/batch_count_per_second_total_cum": 0.08857175200579255, "throughput/batch_count_per_second_update_recent": 0.14502694065846783, "throughput/batch_count_per_second_update_cum": 0.14479291402618746, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 713031680, "throughput/token_count": 713031680, "throughput/batch_count": 340, "throughput/flop_count": 0, "throughput/total_time": 3794.8868216549745, "throughput/update_time": 2348.075175291684, "throughput/token_count_per_second_total_recent": 198774.4938932409, "throughput/token_count_per_second_total_cum": 187892.73923300888, "throughput/token_count_per_second_update_recent": 304140.6648394944, "throughput/token_count_per_second_update_cum": 303666.4615780137, "throughput/batch_count_per_second_total_recent": 0.09478306479131741, "throughput/batch_count_per_second_total_cum": 0.08959423982286877, "throughput/batch_count_per_second_update_recent": 0.1450255703160736, "throughput/batch_count_per_second_update_cum": 0.1447994525804585, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 734003200, "throughput/token_count": 734003200, "throughput/batch_count": 350, "throughput/flop_count": 0, "throughput/total_time": 3954.2480487469584, "throughput/update_time": 2417.024196324637, "throughput/token_count_per_second_total_recent": 182961.51059960516, "throughput/token_count_per_second_total_cum": 185623.9646454639, "throughput/token_count_per_second_update_recent": 304141.6229259774, "throughput/token_count_per_second_update_cum": 303680.53456648724, "throughput/batch_count_per_second_total_recent": 0.08724284677486666, "throughput/batch_count_per_second_total_cum": 0.08851240379594034, "throughput/batch_count_per_second_update_recent": 0.14502602716730947, "throughput/batch_count_per_second_update_cum": 0.14480616310428965, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 754974720, "throughput/token_count": 754974720, "throughput/batch_count": 360, "throughput/flop_count": 0, "throughput/total_time": 4023.3350287670037, "throughput/update_time": 2485.9809508775943, "throughput/token_count_per_second_total_recent": 198742.54887023245, "throughput/token_count_per_second_total_cum": 187648.98140520268, "throughput/token_count_per_second_update_recent": 304138.69621489994, "throughput/token_count_per_second_update_cum": 303692.8821733251, "throughput/batch_count_per_second_total_recent": 0.09476783221732733, "throughput/batch_count_per_second_total_cum": 0.08947800703296789, "throughput/batch_count_per_second_update_recent": 0.14502463160271642, "throughput/batch_count_per_second_update_cum": 0.14481205090204483, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 775946240, "throughput/token_count": 775946240, "throughput/batch_count": 370, "throughput/flop_count": 0, "throughput/total_time": 4182.707546444959, "throughput/update_time": 2554.9301924086176, "throughput/token_count_per_second_total_recent": 182932.6565009825, "throughput/token_count_per_second_total_cum": 185512.90793914243, "throughput/token_count_per_second_update_recent": 304139.65267526807, "throughput/token_count_per_second_update_cum": 303705.4563390985, "throughput/batch_count_per_second_total_recent": 0.08722908806847691, "throughput/batch_count_per_second_total_cum": 0.08845944783169862, "throughput/batch_count_per_second_update_recent": 0.1450250876785603, "throughput/batch_count_per_second_update_cum": 0.144818046731519, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 796917760, "throughput/token_count": 796917760, "throughput/batch_count": 380, "throughput/flop_count": 0, "throughput/total_time": 4251.801609379996, "throughput/update_time": 2623.8855139956577, "throughput/token_count_per_second_total_recent": 198681.4267957253, "throughput/token_count_per_second_total_cum": 187430.60782561012, "throughput/token_count_per_second_update_recent": 304138.139813683, "throughput/token_count_per_second_update_cum": 303716.6658946381, "throughput/batch_count_per_second_total_recent": 0.09473868694101586, "throughput/batch_count_per_second_total_cum": 0.08937387839584833, "throughput/batch_count_per_second_update_recent": 0.1450243662899413, "throughput/batch_count_per_second_update_cum": 0.1448233918641272, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 817889280, "throughput/token_count": 817889280, "throughput/batch_count": 390, "throughput/flop_count": 0, "throughput/total_time": 4410.926594229008, "throughput/update_time": 2692.833996849775, "throughput/token_count_per_second_total_recent": 182920.10508841762, "throughput/token_count_per_second_total_cum": 185423.46206125428, "throughput/token_count_per_second_update_recent": 304142.60140023567, "throughput/token_count_per_second_update_cum": 303728.07271328714, "throughput/batch_count_per_second_total_recent": 0.08722310308857804, "throughput/batch_count_per_second_total_cum": 0.08841679671347345, "throughput/batch_count_per_second_update_recent": 0.14502649374019416, "throughput/batch_count_per_second_update_cum": 0.14482883105911595, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 838860800, "throughput/token_count": 838860800, "throughput/batch_count": 400, "throughput/flop_count": 0, "throughput/total_time": 4480.02484513697, "throughput/update_time": 2761.798597707704, "throughput/token_count_per_second_total_recent": 198751.25565572112, "throughput/token_count_per_second_total_cum": 187244.67586615653, "throughput/token_count_per_second_update_recent": 304138.86972494656, "throughput/token_count_per_second_update_cum": 303737.137348196, "throughput/batch_count_per_second_total_recent": 0.09477198393617683, "throughput/batch_count_per_second_total_cum": 0.08928521912868334, "throughput/batch_count_per_second_update_recent": 0.14502471433875397, "throughput/batch_count_per_second_update_cum": 0.14483315341386604, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 859832320, "throughput/token_count": 859832320, "throughput/batch_count": 410, "throughput/flop_count": 0, "throughput/total_time": 4639.57663258299, "throughput/update_time": 2830.754078882688, "throughput/token_count_per_second_total_recent": 182911.33274657672, "throughput/token_count_per_second_total_cum": 185325.59931471717, "throughput/token_count_per_second_update_recent": 304136.171680192, "throughput/token_count_per_second_update_cum": 303746.7388687398, "throughput/batch_count_per_second_total_recent": 0.08721892011002384, "throughput/batch_count_per_second_total_cum": 0.08837013211952074, "throughput/batch_count_per_second_update_recent": 0.1450234278107605, "throughput/batch_count_per_second_update_cum": 0.14483773177563658, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 880803840, "throughput/token_count": 880803840, "throughput/batch_count": 420, "throughput/flop_count": 0, "throughput/total_time": 4708.671424973989, "throughput/update_time": 2899.7122086867457, "throughput/token_count_per_second_total_recent": 198665.87564221423, "throughput/token_count_per_second_total_cum": 187059.949719228, "throughput/token_count_per_second_update_recent": 304133.5990933948, "throughput/token_count_per_second_update_cum": 303755.6062844279, "throughput/batch_count_per_second_total_recent": 0.09473127157316887, "throughput/batch_count_per_second_total_cum": 0.08919713483773613, "throughput/batch_count_per_second_update_recent": 0.1450222011057829, "throughput/batch_count_per_second_update_cum": 0.1448419600889339, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 901775360, "throughput/token_count": 901775360, "throughput/batch_count": 430, "throughput/flop_count": 0, "throughput/total_time": 4867.7282030819915, "throughput/update_time": 2968.665082120744, "throughput/token_count_per_second_total_recent": 182920.6451486221, "throughput/token_count_per_second_total_cum": 185255.89810643965, "throughput/token_count_per_second_update_recent": 304135.1417802123, "throughput/token_count_per_second_update_cum": 303764.5995943042, "throughput/batch_count_per_second_total_recent": 0.08722336060935121, "throughput/batch_count_per_second_total_cum": 0.08833689599344237, "throughput/batch_count_per_second_update_recent": 0.14502293671618094, "throughput/batch_count_per_second_update_cum": 0.14484624843325816, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 922746880, "throughput/token_count": 922746880, "throughput/batch_count": 440, "throughput/flop_count": 0, "throughput/total_time": 4936.824467270984, "throughput/update_time": 3037.629162015859, "throughput/token_count_per_second_total_recent": 198725.75980505077, "throughput/token_count_per_second_total_cum": 186911.01660944472, "throughput/token_count_per_second_update_recent": 304132.59824274207, "throughput/token_count_per_second_update_cum": 303772.06393016, "throughput/batch_count_per_second_total_recent": 0.09475982656719721, "throughput/batch_count_per_second_total_cum": 0.08912611799690472, "throughput/batch_count_per_second_update_recent": 0.14502172386300186, "throughput/batch_count_per_second_update_cum": 0.1448498077059555, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 943718400, "throughput/token_count": 943718400, "throughput/batch_count": 450, "throughput/flop_count": 0, "throughput/total_time": 5096.086593801971, "throughput/update_time": 3106.58031973982, "throughput/token_count_per_second_total_recent": 182936.24300957212, "throughput/token_count_per_second_total_cum": 185184.92231819246, "throughput/token_count_per_second_update_recent": 304130.896932557, "throughput/token_count_per_second_update_cum": 303780.4604643338, "throughput/batch_count_per_second_total_recent": 0.08723079824904066, "throughput/batch_count_per_second_total_cum": 0.08830305210027335, "throughput/batch_count_per_second_update_recent": 0.14502091261508798, "throughput/batch_count_per_second_update_cum": 0.1448538114854497, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 964689920, "throughput/token_count": 964689920, "throughput/batch_count": 460, "throughput/flop_count": 0, "throughput/total_time": 5165.18791128695, "throughput/update_time": 3175.5471031158813, "throughput/token_count_per_second_total_recent": 198744.22444971412, "throughput/token_count_per_second_total_cum": 186767.6329629679, "throughput/token_count_per_second_update_recent": 304126.36942010815, "throughput/token_count_per_second_update_cum": 303786.9975392384, "throughput/batch_count_per_second_total_recent": 0.09476863119588572, "throughput/batch_count_per_second_total_cum": 0.08905774734638591, "throughput/batch_count_per_second_update_recent": 0.14501875372891815, "throughput/batch_count_per_second_update_cum": 0.14485692860567018, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 985661440, "throughput/token_count": 985661440, "throughput/batch_count": 470, "throughput/flop_count": 0, "throughput/total_time": 5324.26284656598, "throughput/update_time": 3244.498165418743, "throughput/token_count_per_second_total_recent": 182982.483462679, "throughput/token_count_per_second_total_cum": 185126.3674248779, "throughput/token_count_per_second_update_recent": 304127.3918953004, "throughput/token_count_per_second_update_cum": 303794.728721256, "throughput/batch_count_per_second_total_recent": 0.08725284741529417, "throughput/batch_count_per_second_total_cum": 0.08827513095134636, "throughput/batch_count_per_second_update_recent": 0.14501924128308316, "throughput/batch_count_per_second_update_cum": 0.144860615120533, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1006632960, "throughput/token_count": 1006632960, "throughput/batch_count": 480, "throughput/flop_count": 0, "throughput/total_time": 5393.351182107988, "throughput/update_time": 3313.452428144694, "throughput/token_count_per_second_total_recent": 198756.00801706532, "throughput/token_count_per_second_total_cum": 186643.3180430424, "throughput/token_count_per_second_update_recent": 304126.7615478465, "throughput/token_count_per_second_update_cum": 303801.84470119147, "throughput/batch_count_per_second_total_recent": 0.09477425003865496, "throughput/batch_count_per_second_total_cum": 0.08899846937324639, "throughput/batch_count_per_second_update_recent": 0.14501894070999455, "throughput/batch_count_per_second_update_cum": 0.14486400828418325, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1027604480, "throughput/token_count": 1027604480, "throughput/batch_count": 490, "throughput/flop_count": 0, "throughput/total_time": 5552.3678305439535, "throughput/update_time": 3382.4195148196886, "throughput/token_count_per_second_total_recent": 183001.54615708414, "throughput/token_count_per_second_total_cum": 185075.00067756278, "throughput/token_count_per_second_update_recent": 304122.4017147204, "throughput/token_count_per_second_update_cum": 303807.518700051, "throughput/batch_count_per_second_total_recent": 0.08726193721632201, "throughput/batch_count_per_second_total_cum": 0.08825063737753047, "throughput/batch_count_per_second_update_recent": 0.14501686177955647, "throughput/batch_count_per_second_update_cum": 0.14486671385767508, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1048576000, "throughput/token_count": 1048576000, "throughput/batch_count": 500, "throughput/flop_count": 0, "throughput/total_time": 5621.47650762595, "throughput/update_time": 3451.396044731722, "throughput/token_count_per_second_total_recent": 198853.9855978587, "throughput/token_count_per_second_total_cum": 186530.35347164198, "throughput/token_count_per_second_update_recent": 304113.8848092816, "throughput/token_count_per_second_update_cum": 303812.13468693825, "throughput/batch_count_per_second_total_recent": 0.09482096938984809, "throughput/batch_count_per_second_total_cum": 0.08894460366804217, "throughput/batch_count_per_second_update_recent": 0.14501280060257035, "throughput/batch_count_per_second_update_cum": 0.1448689149317447, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1069547520, "throughput/token_count": 1069547520, "throughput/batch_count": 510, "throughput/flop_count": 0, "throughput/total_time": 5780.910811659007, "throughput/update_time": 3520.3645316287293, "throughput/token_count_per_second_total_recent": 183017.3090454295, "throughput/token_count_per_second_total_cum": 185013.66909915378, "throughput/token_count_per_second_update_recent": 304106.72237667075, "throughput/token_count_per_second_update_cum": 303817.26391987136, "throughput/batch_count_per_second_total_recent": 0.08726945354720568, "throughput/batch_count_per_second_total_cum": 0.08822139220197381, "throughput/batch_count_per_second_update_recent": 0.14500938528855836, "throughput/batch_count_per_second_update_cum": 0.14487136074060028, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1090519040, "throughput/token_count": 1090519040, "throughput/batch_count": 520, "throughput/flop_count": 0, "throughput/total_time": 5850.004313221958, "throughput/update_time": 3589.3208626466803, "throughput/token_count_per_second_total_recent": 198782.16111184834, "throughput/token_count_per_second_total_cum": 186413.37366799032, "throughput/token_count_per_second_update_recent": 304107.27656987915, "throughput/token_count_per_second_update_cum": 303823.2249863215, "throughput/batch_count_per_second_total_recent": 0.09478672080604951, "throughput/batch_count_per_second_total_cum": 0.08888882335090176, "throughput/batch_count_per_second_update_recent": 0.14500964954847295, "throughput/batch_count_per_second_update_cum": 0.14487420319858624, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1111490560, "throughput/token_count": 1111490560, "throughput/batch_count": 530, "throughput/flop_count": 0, "throughput/total_time": 6009.048461712955, "throughput/update_time": 3658.2837809736375, "throughput/token_count_per_second_total_recent": 183019.11782553338, "throughput/token_count_per_second_total_cum": 184969.47845934922, "throughput/token_count_per_second_update_recent": 304102.75801448023, "throughput/token_count_per_second_update_cum": 303828.4142364104, "throughput/batch_count_per_second_total_recent": 0.08727031604077023, "throughput/batch_count_per_second_total_cum": 0.08820032046287023, "throughput/batch_count_per_second_update_recent": 0.14500749493335735, "throughput/batch_count_per_second_update_cum": 0.14487667762585182, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1132462080, "throughput/token_count": 1132462080, "throughput/batch_count": 540, "throughput/flop_count": 0, "throughput/total_time": 6078.168775815982, "throughput/update_time": 3727.2589657856734, "throughput/token_count_per_second_total_recent": 198820.90746422115, "throughput/token_count_per_second_total_cum": 186316.32680321042, "throughput/token_count_per_second_update_recent": 304099.311140528, "throughput/token_count_per_second_update_cum": 303832.4115376531, "throughput/batch_count_per_second_total_recent": 0.0948051965066057, "throughput/batch_count_per_second_total_cum": 0.08884254779968759, "throughput/batch_count_per_second_update_recent": 0.14500585133577729, "throughput/batch_count_per_second_update_cum": 0.14487858368761689, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1153433600, "throughput/token_count": 1153433600, "throughput/batch_count": 550, "throughput/flop_count": 0, "throughput/total_time": 6237.341601410008, "throughput/update_time": 3796.2288036436657, "throughput/token_count_per_second_total_recent": 183030.9363246714, "throughput/token_count_per_second_total_cum": 184923.91049085013, "throughput/token_count_per_second_update_recent": 304091.6898752834, "throughput/token_count_per_second_update_cum": 303836.6915326391, "throughput/batch_count_per_second_total_recent": 0.08727595154031344, "throughput/batch_count_per_second_total_cum": 0.08817859196226603, "throughput/batch_count_per_second_update_recent": 0.14500221723331613, "throughput/batch_count_per_second_update_cum": 0.14488062454826311, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1174405120, "throughput/token_count": 1174405120, "throughput/batch_count": 560, "throughput/flop_count": 0, "throughput/total_time": 6306.462939933001, "throughput/update_time": 3865.200992291735, "throughput/token_count_per_second_total_recent": 198798.2710563915, "throughput/token_count_per_second_total_cum": 186222.47227738038, "throughput/token_count_per_second_update_recent": 304088.01713739004, "throughput/token_count_per_second_update_cum": 303840.6339908543, "throughput/batch_count_per_second_total_recent": 0.09479440262622428, "throughput/batch_count_per_second_total_cum": 0.08879779447430629, "throughput/batch_count_per_second_update_recent": 0.14500046593541624, "throughput/batch_count_per_second_update_cum": 0.1448825044588348, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1195376640, "throughput/token_count": 1195376640, "throughput/batch_count": 570, "throughput/flop_count": 0, "throughput/total_time": 6465.5906408529845, "throughput/update_time": 3934.16611681378, "throughput/token_count_per_second_total_recent": 183018.13276400758, "throughput/token_count_per_second_total_cum": 184882.82144665715, "throughput/token_count_per_second_update_recent": 304081.1863503552, "throughput/token_count_per_second_update_cum": 303844.9837924274, "throughput/batch_count_per_second_total_recent": 0.08726984632683162, "throughput/batch_count_per_second_total_cum": 0.08815899917919977, "throughput/batch_count_per_second_update_recent": 0.14499720876233826, "throughput/batch_count_per_second_update_cum": 0.14488457860585566, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1216348160, "throughput/token_count": 1216348160, "throughput/batch_count": 580, "throughput/flop_count": 0, "throughput/total_time": 6534.709841756965, "throughput/update_time": 4003.142038117745, "throughput/token_count_per_second_total_recent": 198766.85321651908, "throughput/token_count_per_second_total_cum": 186136.52165969234, "throughput/token_count_per_second_update_recent": 304073.22831041255, "throughput/token_count_per_second_update_cum": 303848.364214406, "throughput/batch_count_per_second_total_recent": 0.09477942143274264, "throughput/batch_count_per_second_total_cum": 0.08875681002602212, "throughput/batch_count_per_second_update_recent": 0.14499341407318714, "throughput/batch_count_per_second_update_cum": 0.14488619051666546, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1237319680, "throughput/token_count": 1237319680, "throughput/batch_count": 590, "throughput/flop_count": 0, "throughput/total_time": 6694.0332092359895, "throughput/update_time": 4072.1044038116815, "throughput/token_count_per_second_total_recent": 182964.6548258954, "throughput/token_count_per_second_total_cum": 184839.19056344495, "throughput/token_count_per_second_update_recent": 304075.3273669953, "throughput/token_count_per_second_update_cum": 303852.6416075704, "throughput/batch_count_per_second_total_recent": 0.08724434605879565, "throughput/batch_count_per_second_total_cum": 0.08813819435283897, "throughput/batch_count_per_second_update_recent": 0.14499441498136295, "throughput/batch_count_per_second_update_cum": 0.1448882301366665, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1258291200, "throughput/token_count": 1258291200, "throughput/batch_count": 600, "throughput/flop_count": 0, "throughput/total_time": 6763.143532235001, "throughput/update_time": 4141.075749416603, "throughput/token_count_per_second_total_recent": 198788.70779254206, "throughput/token_count_per_second_total_cum": 186051.2340160516, "throughput/token_count_per_second_update_recent": 304076.5618298096, "throughput/token_count_per_second_update_cum": 303856.1176228831, "throughput/batch_count_per_second_total_recent": 0.09478984250666717, "throughput/batch_count_per_second_total_cum": 0.08871614170839863, "throughput/batch_count_per_second_update_recent": 0.14499500361910325, "throughput/batch_count_per_second_update_cum": 0.14488988762993008, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1279262720, "throughput/token_count": 1279262720, "throughput/batch_count": 610, "throughput/flop_count": 0, "throughput/total_time": 6922.65050104697, "throughput/update_time": 4210.039972436498, "throughput/token_count_per_second_total_recent": 182952.43118967788, "throughput/token_count_per_second_total_cum": 184793.77513085867, "throughput/token_count_per_second_update_recent": 304079.6145091415, "throughput/token_count_per_second_update_cum": 303859.9938184543, "throughput/batch_count_per_second_total_recent": 0.08723851737483877, "throughput/batch_count_per_second_total_cum": 0.08811653858702596, "throughput/batch_count_per_second_update_recent": 0.14499645925004076, "throughput/batch_count_per_second_update_cum": 0.1448917359440109, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1300234240, "throughput/token_count": 1300234240, "throughput/batch_count": 620, "throughput/flop_count": 0, "throughput/total_time": 6991.763516802981, "throughput/update_time": 4279.012207661464, "throughput/token_count_per_second_total_recent": 198697.126128854, "throughput/token_count_per_second_total_cum": 185966.5643546449, "throughput/token_count_per_second_update_recent": 304071.0877529559, "throughput/token_count_per_second_update_cum": 303863.17610217683, "throughput/batch_count_per_second_total_recent": 0.09474617296641064, "throughput/batch_count_per_second_total_cum": 0.08867576806766744, "throughput/batch_count_per_second_update_recent": 0.14499239337585254, "throughput/batch_count_per_second_update_cum": 0.14489325337513773, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1321205760, "throughput/token_count": 1321205760, "throughput/batch_count": 630, "throughput/flop_count": 0, "throughput/total_time": 7150.874404011003, "throughput/update_time": 4347.981613874435, "throughput/token_count_per_second_total_recent": 182938.98259403164, "throughput/token_count_per_second_total_cum": 184761.4271142731, "throughput/token_count_per_second_update_recent": 304069.97284424433, "throughput/token_count_per_second_update_cum": 303866.4551349584, "throughput/batch_count_per_second_total_recent": 0.087232104584709, "throughput/batch_count_per_second_total_cum": 0.08810111385072379, "throughput/batch_count_per_second_update_recent": 0.14499186174595086, "throughput/batch_count_per_second_update_cum": 0.14489481693981093, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1342177280, "throughput/token_count": 1342177280, "throughput/batch_count": 640, "throughput/flop_count": 0, "throughput/total_time": 7219.974368761992, "throughput/update_time": 4416.951972602401, "throughput/token_count_per_second_total_recent": 198709.65969068484, "throughput/token_count_per_second_total_cum": 185897.79013718895, "throughput/token_count_per_second_update_recent": 304070.47578925313, "throughput/token_count_per_second_update_cum": 303869.5662360145, "throughput/batch_count_per_second_total_recent": 0.09475214943441622, "throughput/batch_count_per_second_total_cum": 0.0886429739652581, "throughput/batch_count_per_second_update_recent": 0.1449921015688196, "throughput/batch_count_per_second_update_cum": 0.1448963004283974, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1363148800, "throughput/token_count": 1363148800, "throughput/batch_count": 650, "throughput/flop_count": 0, "throughput/total_time": 7379.584015795961, "throughput/update_time": 4485.922716939414, "throughput/token_count_per_second_total_recent": 182871.73140531377, "throughput/token_count_per_second_total_cum": 184718.9214300138, "throughput/token_count_per_second_update_recent": 304070.40639546694, "throughput/token_count_per_second_update_cum": 303872.55555085174, "throughput/batch_count_per_second_total_recent": 0.08720003671899498, "throughput/batch_count_per_second_total_cum": 0.08808084556103411, "throughput/batch_count_per_second_update_recent": 0.14499206847928378, "throughput/batch_count_per_second_update_cum": 0.14489772584478938, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1384120320, "throughput/token_count": 1384120320, "throughput/batch_count": 660, "throughput/flop_count": 0, "throughput/total_time": 7448.697525598982, "throughput/update_time": 4554.898137903423, "throughput/token_count_per_second_total_recent": 198620.3780027806, "throughput/token_count_per_second_total_cum": 185820.4491782873, "throughput/token_count_per_second_update_recent": 304069.83291899733, "throughput/token_count_per_second_update_cum": 303875.14234008704, "throughput/batch_count_per_second_total_recent": 0.09470957660807638, "throughput/batch_count_per_second_total_cum": 0.08860609492220273, "throughput/batch_count_per_second_update_recent": 0.14499179502439372, "throughput/batch_count_per_second_update_cum": 0.1448989593220172, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1405091840, "throughput/token_count": 1405091840, "throughput/batch_count": 670, "throughput/flop_count": 0, "throughput/total_time": 7607.930423379992, "throughput/update_time": 4623.874382758397, "throughput/token_count_per_second_total_recent": 182855.55105213504, "throughput/token_count_per_second_total_cum": 184687.78784858508, "throughput/token_count_per_second_update_recent": 304064.10153348977, "throughput/token_count_per_second_update_cum": 303877.5978083092, "throughput/batch_count_per_second_total_recent": 0.08719232132536651, "throughput/batch_count_per_second_total_cum": 0.08806599991254095, "throughput/batch_count_per_second_update_recent": 0.14498906208681572, "throughput/batch_count_per_second_update_cum": 0.14490013018050632, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1426063360, "throughput/token_count": 1426063360, "throughput/batch_count": 680, "throughput/flop_count": 0, "throughput/total_time": 7677.034741098003, "throughput/update_time": 4692.8429151014425, "throughput/token_count_per_second_total_recent": 198642.7964574922, "throughput/token_count_per_second_total_cum": 185757.0543957234, "throughput/token_count_per_second_update_recent": 304068.91578101035, "throughput/token_count_per_second_update_cum": 303880.4805102183, "throughput/batch_count_per_second_total_recent": 0.09472026656031236, "throughput/batch_count_per_second_total_cum": 0.08857586593424005, "throughput/batch_count_per_second_update_recent": 0.14499135769892232, "throughput/batch_count_per_second_update_cum": 0.1449015047598926, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1447034880, "throughput/token_count": 1447034880, "throughput/batch_count": 690, "throughput/flop_count": 0, "throughput/total_time": 7836.185687895981, "throughput/update_time": 4761.811451301328, "throughput/token_count_per_second_total_recent": 182885.49754010158, "throughput/token_count_per_second_total_cum": 184660.61648272778, "throughput/token_count_per_second_update_recent": 304064.00939694623, "throughput/token_count_per_second_update_cum": 303883.2794617578, "throughput/batch_count_per_second_total_recent": 0.08720660092358665, "throughput/batch_count_per_second_total_cum": 0.08805304359566106, "throughput/batch_count_per_second_update_recent": 0.14498901815268814, "throughput/batch_count_per_second_update_cum": 0.14490283940399065, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1468006400, "throughput/token_count": 1468006400, "throughput/batch_count": 700, "throughput/flop_count": 0, "throughput/total_time": 7905.308589838969, "throughput/update_time": 4830.788480303425, "throughput/token_count_per_second_total_recent": 198707.9817705395, "throughput/token_count_per_second_total_cum": 185698.81027628592, "throughput/token_count_per_second_update_recent": 304062.1741587079, "throughput/token_count_per_second_update_cum": 303885.4642436743, "throughput/batch_count_per_second_total_recent": 0.09475134933974243, "throughput/batch_count_per_second_total_cum": 0.08854809297384544, "throughput/batch_count_per_second_update_recent": 0.14498814304290195, "throughput/batch_count_per_second_update_cum": 0.144903881189191, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1488977920, "throughput/token_count": 1488977920, "throughput/batch_count": 710, "throughput/flop_count": 0, "throughput/total_time": 8064.825394029962, "throughput/update_time": 4899.746858645405, "throughput/token_count_per_second_total_recent": 182882.16046033218, "throughput/token_count_per_second_total_cum": 184626.18187645145, "throughput/token_count_per_second_update_recent": 304065.20136069774, "throughput/token_count_per_second_update_cum": 303888.744246605, "throughput/batch_count_per_second_total_recent": 0.0872050096799527, "throughput/batch_count_per_second_total_cum": 0.08803662389586041, "throughput/batch_count_per_second_update_recent": 0.1449895865252961, "throughput/batch_count_per_second_update_cum": 0.1449054452164674, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1509949440, "throughput/token_count": 1509949440, "throughput/batch_count": 720, "throughput/flop_count": 0, "throughput/total_time": 8133.938365876966, "throughput/update_time": 4968.716519999434, "throughput/token_count_per_second_total_recent": 198631.8527183569, "throughput/token_count_per_second_total_cum": 185635.71201061146, "throughput/token_count_per_second_update_recent": 304066.3046382736, "throughput/token_count_per_second_update_cum": 303891.2431253317, "throughput/batch_count_per_second_total_recent": 0.09471504817884298, "throughput/batch_count_per_second_total_cum": 0.08851800537615369, "throughput/batch_count_per_second_update_recent": 0.14499011260904007, "throughput/batch_count_per_second_update_cum": 0.14490663677469812, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1530920960, "throughput/token_count": 1530920960, "throughput/batch_count": 730, "throughput/flop_count": 0, "throughput/total_time": 8293.024653506, "throughput/update_time": 5037.670779642533, "throughput/token_count_per_second_total_recent": 182886.1744953693, "throughput/token_count_per_second_total_cum": 184603.44976217818, "throughput/token_count_per_second_update_recent": 304072.9110926811, "throughput/token_count_per_second_update_cum": 303894.60267759545, "throughput/batch_count_per_second_total_recent": 0.08720692372101274, "throughput/batch_count_per_second_total_cum": 0.0880257843790904, "throughput/batch_count_per_second_update_recent": 0.14499326281198555, "throughput/batch_count_per_second_update_cum": 0.1449082387340524, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1551892480, "throughput/token_count": 1551892480, "throughput/batch_count": 740, "throughput/flop_count": 0, "throughput/total_time": 8362.121594669996, "throughput/update_time": 5106.633266707533, "throughput/token_count_per_second_total_recent": 198732.92561066395, "throughput/token_count_per_second_total_cum": 185585.97389796076, "throughput/token_count_per_second_update_recent": 304077.487720853, "throughput/token_count_per_second_update_cum": 303897.3818851441, "throughput/batch_count_per_second_total_recent": 0.09476324348958204, "throughput/batch_count_per_second_total_cum": 0.08849428839586294, "throughput/batch_count_per_second_update_recent": 0.14499544511835719, "throughput/batch_count_per_second_update_cum": 0.14490956396348195, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1572864000, "throughput/token_count": 1572864000, "throughput/batch_count": 750, "throughput/flop_count": 0, "throughput/total_time": 8521.302722692955, "throughput/update_time": 5175.6192292046035, "throughput/token_count_per_second_total_recent": 182956.7703258969, "throughput/token_count_per_second_total_cum": 184580.22806904034, "throughput/token_count_per_second_update_recent": 304071.38823100366, "throughput/token_count_per_second_update_cum": 303898.70860761136, "throughput/batch_count_per_second_total_recent": 0.08724058643622251, "throughput/batch_count_per_second_total_cum": 0.08801471141292588, "throughput/batch_count_per_second_update_recent": 0.14499253665495093, "throughput/batch_count_per_second_update_cum": 0.14491019659405296, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1593835520, "throughput/token_count": 1593835520, "throughput/batch_count": 760, "throughput/flop_count": 0, "throughput/total_time": 8590.419437660952, "throughput/update_time": 5244.595247996622, "throughput/token_count_per_second_total_recent": 198736.98658764228, "throughput/token_count_per_second_total_cum": 185536.40268279827, "throughput/token_count_per_second_update_recent": 304069.8291321375, "throughput/token_count_per_second_update_cum": 303900.57661910664, "throughput/batch_count_per_second_total_recent": 0.09476517991430392, "throughput/batch_count_per_second_total_cum": 0.08847065099849619, "throughput/batch_count_per_second_update_recent": 0.14499179321867825, "throughput/batch_count_per_second_update_cum": 0.14491108733134586, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1614807040, "throughput/token_count": 1614807040, "throughput/batch_count": 770, "throughput/flop_count": 0, "throughput/total_time": 8749.521732740977, "throughput/update_time": 5313.566678232513, "throughput/token_count_per_second_total_recent": 182976.38747267786, "throughput/token_count_per_second_total_cum": 184559.46385701778, "throughput/token_count_per_second_update_recent": 304072.2250598937, "throughput/token_count_per_second_update_cum": 303902.65856928023, "throughput/batch_count_per_second_total_recent": 0.08724994062074559, "throughput/batch_count_per_second_total_cum": 0.08800481026507272, "throughput/batch_count_per_second_update_recent": 0.14499293568606075, "throughput/batch_count_per_second_update_cum": 0.14491208008255016, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1635778560, "throughput/token_count": 1635778560, "throughput/batch_count": 780, "throughput/flop_count": 0, "throughput/total_time": 8818.629101625993, "throughput/update_time": 5382.53907933255, "throughput/token_count_per_second_total_recent": 198746.13045774214, "throughput/token_count_per_second_total_cum": 185491.25279556122, "throughput/token_count_per_second_update_recent": 304069.1609170095, "throughput/token_count_per_second_update_cum": 303904.63234738674, "throughput/batch_count_per_second_total_recent": 0.09476954005133731, "throughput/batch_count_per_second_total_cum": 0.08844912185457288, "throughput/batch_count_per_second_update_recent": 0.14499147458887554, "throughput/batch_count_per_second_update_cum": 0.1449130212532934, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1656750080, "throughput/token_count": 1656750080, "throughput/batch_count": 790, "throughput/flop_count": 0, "throughput/total_time": 8977.726054205967, "throughput/update_time": 5451.502941412618, "throughput/token_count_per_second_total_recent": 182984.68564197747, "throughput/token_count_per_second_total_cum": 184540.05724799662, "throughput/token_count_per_second_update_recent": 304071.98751209263, "throughput/token_count_per_second_update_cum": 303907.0322083868, "throughput/batch_count_per_second_total_recent": 0.0872538974962127, "throughput/batch_count_per_second_total_cum": 0.08799555647277671, "throughput/batch_count_per_second_update_recent": 0.14499282241444236, "throughput/batch_count_per_second_update_cum": 0.14491416559619275, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1677721600, "throughput/token_count": 1677721600, "throughput/batch_count": 800, "throughput/flop_count": 0, "throughput/total_time": 9046.829455698957, "throughput/update_time": 5520.4718770905165, "throughput/token_count_per_second_total_recent": 198832.84809121367, "throughput/token_count_per_second_total_cum": 185448.571592464, "throughput/token_count_per_second_update_recent": 304073.84702991234, "throughput/token_count_per_second_update_cum": 303909.0928009977, "throughput/batch_count_per_second_total_recent": 0.09481089024124797, "throughput/batch_count_per_second_total_cum": 0.0884287698709793, "throughput/batch_count_per_second_update_recent": 0.14499370910163514, "throughput/batch_count_per_second_update_cum": 0.14491514816331755, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1698693120, "throughput/token_count": 1698693120, "throughput/batch_count": 810, "throughput/flop_count": 0, "throughput/total_time": 9206.457426826004, "throughput/update_time": 5589.4371389435255, "throughput/token_count_per_second_total_recent": 182969.85785432887, "throughput/token_count_per_second_total_cum": 184511.04928268128, "throughput/token_count_per_second_update_recent": 304073.1268019626, "throughput/token_count_per_second_update_cum": 303911.30229636584, "throughput/batch_count_per_second_total_recent": 0.08724682705608791, "throughput/batch_count_per_second_total_cum": 0.08798172439703049, "throughput/batch_count_per_second_update_recent": 0.14499336567018634, "throughput/batch_count_per_second_update_cum": 0.14491620173280995, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1719664640, "throughput/token_count": 1719664640, "throughput/batch_count": 820, "throughput/flop_count": 0, "throughput/total_time": 9275.580202061974, "throughput/update_time": 5658.411268384545, "throughput/token_count_per_second_total_recent": 198728.84064910773, "throughput/token_count_per_second_total_cum": 185396.9889255786, "throughput/token_count_per_second_update_recent": 304069.52367064403, "throughput/token_count_per_second_update_cum": 303912.98165411677, "throughput/batch_count_per_second_total_recent": 0.0947612956281222, "throughput/batch_count_per_second_total_cum": 0.08840417333868913, "throughput/batch_count_per_second_update_recent": 0.14499164756328775, "throughput/batch_count_per_second_update_cum": 0.1449170025129875, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1740636160, "throughput/token_count": 1740636160, "throughput/batch_count": 830, "throughput/flop_count": 0, "throughput/total_time": 9434.65409148595, "throughput/update_time": 5727.392325014458, "throughput/token_count_per_second_total_recent": 182969.63955931875, "throughput/token_count_per_second_total_cum": 184493.9033398999, "throughput/token_count_per_second_update_recent": 304057.62108125945, "throughput/token_count_per_second_update_cum": 303914.25298346503, "throughput/batch_count_per_second_total_recent": 0.08724672296491563, "throughput/batch_count_per_second_total_cum": 0.0879735485743999, "throughput/batch_count_per_second_update_recent": 0.14498597196639035, "throughput/batch_count_per_second_update_cum": 0.14491760873006107, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1761607680, "throughput/token_count": 1761607680, "throughput/batch_count": 840, "throughput/flop_count": 0, "throughput/total_time": 9503.765275373997, "throughput/update_time": 5796.35883763741, "throughput/token_count_per_second_total_recent": 198740.74784775727, "throughput/token_count_per_second_total_cum": 185358.92132822867, "throughput/token_count_per_second_update_recent": 304058.04690969206, "throughput/token_count_per_second_update_cum": 303916.2566267256, "throughput/batch_count_per_second_total_recent": 0.09476697342288841, "throughput/batch_count_per_second_total_cum": 0.08838602129374917, "throughput/batch_count_per_second_update_recent": 0.14498617501721003, "throughput/batch_count_per_second_update_cum": 0.1449185641416195, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1782579200, "throughput/token_count": 1782579200, "throughput/batch_count": 850, "throughput/flop_count": 0, "throughput/total_time": 9662.829677159956, "throughput/update_time": 5865.338317954331, "throughput/token_count_per_second_total_recent": 182987.64390897352, "throughput/token_count_per_second_total_cum": 184477.9696586689, "throughput/token_count_per_second_update_recent": 304060.91839370667, "throughput/token_count_per_second_update_cum": 303917.5412172498, "throughput/batch_count_per_second_total_recent": 0.08725530810784031, "throughput/batch_count_per_second_total_cum": 0.087965950803122, "throughput/batch_count_per_second_update_recent": 0.1449875442474874, "throughput/batch_count_per_second_update_cum": 0.14491917668211451, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1803550720, "throughput/token_count": 1803550720, "throughput/batch_count": 860, "throughput/flop_count": 0, "throughput/total_time": 9731.945241109002, "throughput/update_time": 5934.31139906036, "throughput/token_count_per_second_total_recent": 198750.8356684171, "throughput/token_count_per_second_total_cum": 185322.73613517339, "throughput/token_count_per_second_update_recent": 304060.00535683846, "throughput/token_count_per_second_update_cum": 303919.1236721374, "throughput/batch_count_per_second_total_recent": 0.0947717836706243, "throughput/batch_count_per_second_total_cum": 0.08836876684912366, "throughput/batch_count_per_second_update_recent": 0.14498710887758182, "throughput/batch_count_per_second_update_cum": 0.1449199312554061, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1824522240, "throughput/token_count": 1824522240, "throughput/batch_count": 870, "throughput/flop_count": 0, "throughput/total_time": 9891.181177204999, "throughput/update_time": 6003.2874812923255, "throughput/token_count_per_second_total_recent": 182965.4830551361, "throughput/token_count_per_second_total_cum": 184459.49046052803, "throughput/token_count_per_second_update_recent": 304059.8395317841, "throughput/token_count_per_second_update_cum": 303920.51783054636, "throughput/batch_count_per_second_total_recent": 0.08724474098927312, "throughput/batch_count_per_second_total_cum": 0.08795713923479463, "throughput/batch_count_per_second_update_recent": 0.14498702980603415, "throughput/batch_count_per_second_update_cum": 0.1449205960419399, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1845493760, "throughput/token_count": 1845493760, "throughput/batch_count": 880, "throughput/flop_count": 0, "throughput/total_time": 9960.290040618973, "throughput/update_time": 6072.254921466229, "throughput/token_count_per_second_total_recent": 198725.5654495691, "throughput/token_count_per_second_total_cum": 185285.14254845068, "throughput/token_count_per_second_update_recent": 304061.129545677, "throughput/token_count_per_second_update_cum": 303922.3128587593, "throughput/batch_count_per_second_total_recent": 0.09475973389128166, "throughput/batch_count_per_second_total_cum": 0.08835084083006414, "throughput/batch_count_per_second_update_recent": 0.14498764493259286, "throughput/batch_count_per_second_update_cum": 0.14492145197809186, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1866465280, "throughput/token_count": 1866465280, "throughput/batch_count": 890, "throughput/flop_count": 0, "throughput/total_time": 10119.394410843961, "throughput/update_time": 6141.227180292248, "throughput/token_count_per_second_total_recent": 182963.75674200882, "throughput/token_count_per_second_total_cum": 184444.36536635953, "throughput/token_count_per_second_update_recent": 304056.6683149727, "throughput/token_count_per_second_update_cum": 303923.8290987924, "throughput/batch_count_per_second_total_recent": 0.08724391781902734, "throughput/batch_count_per_second_total_cum": 0.08794992702787377, "throughput/batch_count_per_second_update_recent": 0.14498551765202175, "throughput/batch_count_per_second_update_cum": 0.1449221749776804, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1887436800, "throughput/token_count": 1887436800, "throughput/batch_count": 900, "throughput/flop_count": 0, "throughput/total_time": 10188.515749696002, "throughput/update_time": 6210.206845312321, "throughput/token_count_per_second_total_recent": 198819.52035073345, "throughput/token_count_per_second_total_cum": 185251.3993568019, "throughput/token_count_per_second_update_recent": 304053.60401178827, "throughput/token_count_per_second_update_cum": 303924.9492027311, "throughput/batch_count_per_second_total_recent": 0.09480453507935212, "throughput/batch_count_per_second_total_cum": 0.08833475082244964, "throughput/batch_count_per_second_update_recent": 0.1449840564783994, "throughput/batch_count_per_second_update_cum": 0.1449227090848594, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1908408320, "throughput/token_count": 1908408320, "throughput/batch_count": 910, "throughput/flop_count": 0, "throughput/total_time": 10348.059376804973, "throughput/update_time": 6279.188010795449, "throughput/token_count_per_second_total_recent": 182974.7744549283, "throughput/token_count_per_second_total_cum": 184421.85636059163, "throughput/token_count_per_second_update_recent": 304045.1199409977, "throughput/token_count_per_second_update_cum": 303925.97207138606, "throughput/batch_count_per_second_total_recent": 0.08724917147394576, "throughput/batch_count_per_second_total_cum": 0.08793919389752942, "throughput/batch_count_per_second_update_recent": 0.14498001095819363, "throughput/batch_count_per_second_update_cum": 0.14492319682664206, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1929379840, "throughput/token_count": 1929379840, "throughput/batch_count": 920, "throughput/flop_count": 0, "throughput/total_time": 10417.185683044954, "throughput/update_time": 6348.16824121651, "throughput/token_count_per_second_total_recent": 198727.92219343947, "throughput/token_count_per_second_total_cum": 185211.23638414787, "throughput/token_count_per_second_update_recent": 304043.76173262455, "throughput/token_count_per_second_update_cum": 303927.0174777645, "throughput/batch_count_per_second_total_recent": 0.09476085767433141, "throughput/batch_count_per_second_total_cum": 0.0883155996247043, "throughput/batch_count_per_second_update_recent": 0.14497936331397274, "throughput/batch_count_per_second_update_cum": 0.14492369531524874, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1950351360, "throughput/token_count": 1950351360, "throughput/batch_count": 930, "throughput/flop_count": 0, "throughput/total_time": 10576.369776681007, "throughput/update_time": 6417.1375977324205, "throughput/token_count_per_second_total_recent": 182956.7164887483, "throughput/token_count_per_second_total_cum": 184406.50253172632, "throughput/token_count_per_second_update_recent": 304048.03721056005, "throughput/token_count_per_second_update_cum": 303928.5554184131, "throughput/batch_count_per_second_total_recent": 0.08724056076466956, "throughput/batch_count_per_second_total_cum": 0.08793187262140575, "throughput/batch_count_per_second_update_recent": 0.14498140202072146, "throughput/batch_count_per_second_update_cum": 0.14492442866249708, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1971322880, "throughput/token_count": 1971322880, "throughput/batch_count": 940, "throughput/flop_count": 0, "throughput/total_time": 10645.485156638955, "throughput/update_time": 6486.112916803453, "throughput/token_count_per_second_total_recent": 198704.73259386074, "throughput/token_count_per_second_total_cum": 185179.24274880072, "throughput/token_count_per_second_update_recent": 304044.25916855043, "throughput/token_count_per_second_update_cum": 303929.7812551074, "throughput/batch_count_per_second_total_recent": 0.09474980001156842, "throughput/batch_count_per_second_total_cum": 0.08830034387054478, "throughput/batch_count_per_second_update_recent": 0.14497960050990602, "throughput/batch_count_per_second_update_cum": 0.1449250131869828, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1992294400, "throughput/token_count": 1992294400, "throughput/batch_count": 950, "throughput/flop_count": 0, "throughput/total_time": 10804.645974584972, "throughput/update_time": 6555.09181263647, "throughput/token_count_per_second_total_recent": 182940.34151577365, "throughput/token_count_per_second_total_cum": 184392.38126694178, "throughput/token_count_per_second_update_recent": 304044.3705579108, "throughput/token_count_per_second_update_cum": 303930.8154554582, "throughput/batch_count_per_second_total_recent": 0.08723275256909067, "throughput/batch_count_per_second_total_cum": 0.08792513907763566, "throughput/batch_count_per_second_update_recent": 0.14497965362449208, "throughput/batch_count_per_second_update_cum": 0.1449255063321391, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 2013265920, "throughput/token_count": 2013265920, "throughput/batch_count": 960, "throughput/flop_count": 0, "throughput/total_time": 10873.77756452799, "throughput/update_time": 6624.0805201563635, "throughput/token_count_per_second_total_recent": 198716.15212212942, "throughput/token_count_per_second_total_cum": 185148.71286015605, "throughput/token_count_per_second_update_recent": 304037.8814847285, "throughput/token_count_per_second_update_cum": 303931.3779284308, "throughput/batch_count_per_second_total_recent": 0.09475524526697608, "throughput/batch_count_per_second_total_cum": 0.0882857860852032, "throughput/batch_count_per_second_update_recent": 0.14497655939327644, "throughput/batch_count_per_second_update_cum": 0.14492577454015293, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 2034237440, "throughput/token_count": 2034237440, "throughput/batch_count": 970, "throughput/flop_count": 0, "throughput/total_time": 11032.958788217977, "throughput/update_time": 6693.060108309321, "throughput/token_count_per_second_total_recent": 182945.7826390865, "throughput/token_count_per_second_total_cum": 184378.23244407916, "throughput/token_count_per_second_update_recent": 304035.164648976, "throughput/token_count_per_second_update_cum": 303932.34291658737, "throughput/batch_count_per_second_total_recent": 0.08723534709886861, "throughput/batch_count_per_second_total_cum": 0.08791839239314993, "throughput/batch_count_per_second_update_recent": 0.14497526390503693, "throughput/batch_count_per_second_update_cum": 0.1449262346823632, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 2055208960, "throughput/token_count": 2055208960, "throughput/batch_count": 980, "throughput/flop_count": 0, "throughput/total_time": 11102.099602014001, "throughput/update_time": 6762.03791546938, "throughput/token_count_per_second_total_recent": 198696.03516649496, "throughput/token_count_per_second_total_cum": 185118.94449471254, "throughput/token_count_per_second_update_recent": 304031.3546895658, "throughput/token_count_per_second_update_cum": 303933.36826732947, "throughput/batch_count_per_second_total_recent": 0.09474565275501964, "throughput/batch_count_per_second_total_cum": 0.08827159142242076, "throughput/batch_count_per_second_update_recent": 0.1449734471748189, "throughput/batch_count_per_second_update_cum": 0.14492672360769723, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 20971520, "throughput/token_count": 20971520, "throughput/batch_count": 10, "throughput/flop_count": 0, "throughput/total_time": 57.12486812804127, "throughput/update_time": 56.932655182085, "throughput/token_count_per_second_total_recent": 392645.0030472229, "throughput/token_count_per_second_total_cum": 367117.1713341001, "throughput/token_count_per_second_update_recent": 393740.61648576276, "throughput/token_count_per_second_update_cum": 368356.61243846413, "throughput/batch_count_per_second_total_recent": 0.18722772743569513, "throughput/batch_count_per_second_total_cum": 0.17505510870652202, "throughput/batch_count_per_second_update_recent": 0.18775015663421762, "throughput/batch_count_per_second_update_cum": 0.17564612028048712, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 41943040, "throughput/token_count": 41943040, "throughput/batch_count": 20, "throughput/flop_count": 0, "throughput/total_time": 110.51794190204237, "throughput/update_time": 110.19485108501976, "throughput/token_count_per_second_total_recent": 392713.9581716941, "throughput/token_count_per_second_total_cum": 379513.4009749859, "throughput/token_count_per_second_update_recent": 393740.9144137965, "throughput/token_count_per_second_update_cum": 380626.13259161497, "throughput/batch_count_per_second_total_recent": 0.1872606078012915, "throughput/batch_count_per_second_total_cum": 0.18096609162091537, "throughput/batch_count_per_second_update_recent": 0.18775029869737458, "throughput/batch_count_per_second_update_cum": 0.18149668340283154, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 62914560, "throughput/token_count": 62914560, "throughput/batch_count": 30, "throughput/flop_count": 0, "throughput/total_time": 263.8700318510528, "throughput/update_time": 163.45965232816525, "throughput/token_count_per_second_total_recent": 238672.82502651925, "throughput/token_count_per_second_total_cum": 238430.1072715734, "throughput/token_count_per_second_update_recent": 393734.36561396916, "throughput/token_count_per_second_update_cum": 384893.514111307, "throughput/batch_count_per_second_total_recent": 0.11380807162595713, "throughput/batch_count_per_second_total_cum": 0.1136923347814433, "throughput/batch_count_per_second_update_recent": 0.18774717598627527, "throughput/batch_count_per_second_update_cum": 0.18353152947965, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 83886080, "throughput/token_count": 83886080, "throughput/batch_count": 40, "throughput/flop_count": 0, "throughput/total_time": 317.24918574804906, "throughput/update_time": 216.73109497816768, "throughput/token_count_per_second_total_recent": 265381.2077873784, "throughput/token_count_per_second_total_cum": 264417.0064682849, "throughput/token_count_per_second_update_recent": 393718.58740969433, "throughput/token_count_per_second_update_cum": 387051.4289075605, "throughput/batch_count_per_second_total_recent": 0.12654362096184654, "throughput/batch_count_per_second_total_cum": 0.12608385394491428, "throughput/batch_count_per_second_update_recent": 0.18773965235218731, "throughput/batch_count_per_second_update_cum": 0.18456050343874, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 104857600, "throughput/token_count": 104857600, "throughput/batch_count": 50, "throughput/flop_count": 0, "throughput/total_time": 470.2395834400086, "throughput/update_time": 270.0089025082416, "throughput/token_count_per_second_total_recent": 222818.51165591736, "throughput/token_count_per_second_total_cum": 222987.60821647703, "throughput/token_count_per_second_update_recent": 393699.64914062695, "throughput/token_count_per_second_update_cum": 388348.67675075785, "throughput/batch_count_per_second_total_recent": 0.10624814589305752, "throughput/batch_count_per_second_total_cum": 0.10632877741645672, "throughput/batch_count_per_second_update_recent": 0.18773062188178394, "throughput/batch_count_per_second_update_cum": 0.18517907941377537, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 125829120, "throughput/token_count": 125829120, "throughput/batch_count": 60, "throughput/flop_count": 0, "throughput/total_time": 523.6280801940011, "throughput/update_time": 323.28231063415296, "throughput/token_count_per_second_total_recent": 240455.60949345818, "throughput/token_count_per_second_total_cum": 240302.46802918028, "throughput/token_count_per_second_update_recent": 393692.6424976522, "throughput/token_count_per_second_update_cum": 389223.6471373045, "throughput/batch_count_per_second_total_recent": 0.11465816950486096, "throughput/batch_count_per_second_total_cum": 0.11458514596423162, "throughput/batch_count_per_second_update_recent": 0.18772728085405932, "throughput/batch_count_per_second_update_cum": 0.18559629780640816, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 146800640, "throughput/token_count": 146800640, "throughput/batch_count": 70, "throughput/flop_count": 0, "throughput/total_time": 676.4629730410525, "throughput/update_time": 376.5575643811608, "throughput/token_count_per_second_total_recent": 216814.16455245067, "throughput/token_count_per_second_total_cum": 217012.0846970454, "throughput/token_count_per_second_update_recent": 393685.6901068673, "throughput/token_count_per_second_update_cum": 389849.13300375186, "throughput/batch_count_per_second_total_recent": 0.1033850500833753, "throughput/batch_count_per_second_total_cum": 0.10347942576267499, "throughput/batch_count_per_second_update_recent": 0.18772396569579472, "throughput/batch_count_per_second_update_cum": 0.18589455270946115, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 167772160, "throughput/token_count": 167772160, "throughput/batch_count": 80, "throughput/flop_count": 0, "throughput/total_time": 729.8428020050051, "throughput/update_time": 429.83724463917315, "throughput/token_count_per_second_total_recent": 229852.70072141368, "throughput/token_count_per_second_total_cum": 229874.37779628806, "throughput/token_count_per_second_update_recent": 393676.35714154714, "throughput/token_count_per_second_update_cum": 390315.5487162038, "throughput/batch_count_per_second_total_recent": 0.10960230861731228, "throughput/batch_count_per_second_total_cum": 0.10961264505209353, "throughput/batch_count_per_second_update_recent": 0.18771951539113385, "throughput/batch_count_per_second_update_cum": 0.186116957052328, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 188743680, "throughput/token_count": 188743680, "throughput/batch_count": 90, "throughput/flop_count": 0, "throughput/total_time": 884.0807184120058, "throughput/update_time": 483.11551754607353, "throughput/token_count_per_second_total_recent": 213304.06402356713, "throughput/token_count_per_second_total_cum": 213491.45623153413, "throughput/token_count_per_second_update_recent": 393670.29032426636, "throughput/token_count_per_second_update_cum": 390680.2268714128, "throughput/batch_count_per_second_total_recent": 0.10171130372217518, "throughput/batch_count_per_second_total_cum": 0.10180065929009158, "throughput/batch_count_per_second_update_recent": 0.18771662250722235, "throughput/batch_count_per_second_update_cum": 0.18629084914751662, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 209715200, "throughput/token_count": 209715200, "throughput/batch_count": 100, "throughput/flop_count": 0, "throughput/total_time": 937.4650811910396, "throughput/update_time": 536.3882422860479, "throughput/token_count_per_second_total_recent": 223627.53996271305, "throughput/token_count_per_second_total_cum": 223704.545596044, "throughput/token_count_per_second_update_recent": 393669.59062649374, "throughput/token_count_per_second_update_cum": 390976.50445544627, "throughput/batch_count_per_second_total_recent": 0.10663392065177586, "throughput/batch_count_per_second_total_cum": 0.10667063979913902, "throughput/batch_count_per_second_update_recent": 0.18771628886532485, "throughput/batch_count_per_second_update_cum": 0.18643212530872644, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 230686720, "throughput/token_count": 230686720, "throughput/batch_count": 110, "throughput/flop_count": 0, "throughput/total_time": 1091.3179251340334, "throughput/update_time": 589.6708546730806, "throughput/token_count_per_second_total_recent": 201797.45162836113, "throughput/token_count_per_second_total_cum": 211383.60755108786, "throughput/token_count_per_second_update_recent": 393655.9632455835, "throughput/token_count_per_second_update_cum": 391212.68784412794, "throughput/batch_count_per_second_total_recent": 0.0962245233671003, "throughput/batch_count_per_second_total_cum": 0.10079555871538537, "throughput/batch_count_per_second_update_recent": 0.18770979082373787, "throughput/batch_count_per_second_update_cum": 0.1865447463246002, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 251658240, "throughput/token_count": 251658240, "throughput/batch_count": 120, "throughput/flop_count": 0, "throughput/total_time": 1144.6911778100184, "throughput/update_time": 642.9422557381331, "throughput/token_count_per_second_total_recent": 223522.18253595202, "throughput/token_count_per_second_total_cum": 219848.15195436677, "throughput/token_count_per_second_update_recent": 393648.5907568983, "throughput/token_count_per_second_update_cum": 391416.5506373235, "throughput/batch_count_per_second_total_recent": 0.10658368231580354, "throughput/batch_count_per_second_total_cum": 0.10483176801412905, "throughput/batch_count_per_second_update_recent": 0.18770627534718431, "throughput/batch_count_per_second_update_cum": 0.18664195567957093, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 272629760, "throughput/token_count": 272629760, "throughput/batch_count": 130, "throughput/flop_count": 0, "throughput/total_time": 1298.4373718530405, "throughput/update_time": 696.2163101581973, "throughput/token_count_per_second_total_recent": 201724.22959856785, "throughput/token_count_per_second_total_cum": 209967.58558398666, "throughput/token_count_per_second_update_recent": 393642.6934132873, "throughput/token_count_per_second_update_cum": 391587.7235597251, "throughput/batch_count_per_second_total_recent": 0.09618960838249581, "throughput/batch_count_per_second_total_cum": 0.10012034682463963, "throughput/batch_count_per_second_update_recent": 0.1877034632746159, "throughput/batch_count_per_second_update_cum": 0.18672357728945022, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 293601280, "throughput/token_count": 293601280, "throughput/batch_count": 140, "throughput/flop_count": 0, "throughput/total_time": 1351.811079526029, "throughput/update_time": 749.4968144011218, "throughput/token_count_per_second_total_recent": 223338.99603887036, "throughput/token_count_per_second_total_cum": 217191.05905164074, "throughput/token_count_per_second_update_recent": 393634.9368303475, "throughput/token_count_per_second_update_cum": 391731.19132547517, "throughput/batch_count_per_second_total_recent": 0.10649633218711393, "throughput/batch_count_per_second_total_cum": 0.10356476738531148, "throughput/batch_count_per_second_update_recent": 0.18769976464764954, "throughput/batch_count_per_second_update_cum": 0.18679198805116423, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 314572800, "throughput/token_count": 314572800, "throughput/batch_count": 150, "throughput/flop_count": 0, "throughput/total_time": 1504.7933711430524, "throughput/update_time": 802.7697926640394, "throughput/token_count_per_second_total_recent": 201726.8207715387, "throughput/token_count_per_second_total_cum": 209047.17287599968, "throughput/token_count_per_second_update_recent": 393639.5128984192, "throughput/token_count_per_second_update_cum": 391859.28877078375, "throughput/batch_count_per_second_total_recent": 0.09619084395005165, "throughput/batch_count_per_second_total_cum": 0.09968145984458908, "throughput/batch_count_per_second_update_recent": 0.1877019466869446, "throughput/batch_count_per_second_update_cum": 0.18685306967295826, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 335544320, "throughput/token_count": 335544320, "throughput/batch_count": 160, "throughput/flop_count": 0, "throughput/total_time": 1558.167400816048, "throughput/update_time": 856.0381595880608, "throughput/token_count_per_second_total_recent": 223306.63770638386, "throughput/token_count_per_second_total_cum": 215345.488439989, "throughput/token_count_per_second_update_recent": 393642.67580116354, "throughput/token_count_per_second_update_cum": 391973.55426476465, "throughput/batch_count_per_second_total_recent": 0.10648090253180688, "throughput/batch_count_per_second_total_cum": 0.10268473073958827, "throughput/batch_count_per_second_update_recent": 0.18770345487650086, "throughput/batch_count_per_second_update_cum": 0.18690755570638878, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 356515840, "throughput/token_count": 356515840, "throughput/batch_count": 170, "throughput/flop_count": 0, "throughput/total_time": 1711.111571622023, "throughput/update_time": 909.3183224739623, "throughput/token_count_per_second_total_recent": 201707.3322526355, "throughput/token_count_per_second_total_cum": 208353.35691292537, "throughput/token_count_per_second_update_recent": 393640.71795570373, "throughput/token_count_per_second_update_cum": 392069.34600199765, "throughput/batch_count_per_second_total_recent": 0.09618155110008025, "throughput/batch_count_per_second_total_cum": 0.09935062261244076, "throughput/batch_count_per_second_update_recent": 0.1877025213030356, "throughput/batch_count_per_second_update_cum": 0.1869532327661503, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 377487360, "throughput/token_count": 377487360, "throughput/batch_count": 180, "throughput/flop_count": 0, "throughput/total_time": 1764.4956908360473, "throughput/update_time": 962.5948072728934, "throughput/token_count_per_second_total_recent": 223617.5979590691, "throughput/token_count_per_second_total_cum": 213934.98548083176, "throughput/token_count_per_second_update_recent": 393641.4053810168, "throughput/token_count_per_second_update_cum": 392156.03195434983, "throughput/batch_count_per_second_total_recent": 0.10662917993501143, "throughput/batch_count_per_second_total_cum": 0.10201215051690662, "throughput/batch_count_per_second_update_recent": 0.18770284909296836, "throughput/batch_count_per_second_update_cum": 0.18699456784932605, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 398458880, "throughput/token_count": 398458880, "throughput/batch_count": 190, "throughput/flop_count": 0, "throughput/total_time": 1917.3773277360015, "throughput/update_time": 1015.8676893726224, "throughput/token_count_per_second_total_recent": 201973.07559840105, "throughput/token_count_per_second_total_cum": 207814.53615626707, "throughput/token_count_per_second_update_recent": 393645.8678114634, "throughput/token_count_per_second_update_cum": 392235.0165955957, "throughput/batch_count_per_second_total_recent": 0.09630826740188649, "throughput/batch_count_per_second_total_cum": 0.09909369285405496, "throughput/batch_count_per_second_update_recent": 0.18770497694562122, "throughput/batch_count_per_second_update_cum": 0.18703223066119942, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 419430400, "throughput/token_count": 419430400, "throughput/batch_count": 200, "throughput/flop_count": 0, "throughput/total_time": 1970.7506705410196, "throughput/update_time": 1069.132912081608, "throughput/token_count_per_second_total_recent": 223854.49803022106, "throughput/token_count_per_second_total_cum": 212827.73425867007, "throughput/token_count_per_second_update_recent": 393656.29517616733, "throughput/token_count_per_second_update_cum": 392308.94050709426, "throughput/batch_count_per_second_total_recent": 0.10674214269171765, "throughput/batch_count_per_second_total_cum": 0.1014841719907141, "throughput/batch_count_per_second_update_recent": 0.18770994910057417, "throughput/batch_count_per_second_update_cum": 0.18706748032908166, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 440401920, "throughput/token_count": 440401920, "throughput/batch_count": 210, "throughput/flop_count": 0, "throughput/total_time": 2124.242168805038, "throughput/update_time": 1122.4119238386047, "throughput/token_count_per_second_total_recent": 202045.9687591992, "throughput/token_count_per_second_total_cum": 207321.89882462495, "throughput/token_count_per_second_update_recent": 393654.54204836424, "throughput/token_count_per_second_update_cum": 392371.0276471785, "throughput/batch_count_per_second_total_recent": 0.09634302556953392, "throughput/batch_count_per_second_total_cum": 0.09885878506880996, "throughput/batch_count_per_second_update_recent": 0.18770911314409458, "throughput/batch_count_per_second_update_cum": 0.1870970857845204, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 461373440, "throughput/token_count": 461373440, "throughput/batch_count": 220, "throughput/flop_count": 0, "throughput/total_time": 2177.6055200890405, "throughput/update_time": 1175.6836349036312, "throughput/token_count_per_second_total_recent": 223919.25561582594, "throughput/token_count_per_second_total_cum": 211871.90964740704, "throughput/token_count_per_second_update_recent": 393653.4269920945, "throughput/token_count_per_second_update_cum": 392429.9244310039, "throughput/batch_count_per_second_total_recent": 0.10677302151480958, "throughput/batch_count_per_second_total_cum": 0.10102839929933884, "throughput/batch_count_per_second_update_recent": 0.18770858144383168, "throughput/batch_count_per_second_update_cum": 0.187125169959547, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 482344960, "throughput/token_count": 482344960, "throughput/batch_count": 230, "throughput/flop_count": 0, "throughput/total_time": 2330.6941529700416, "throughput/update_time": 1228.951100654609, "throughput/token_count_per_second_total_recent": 202177.050586507, "throughput/token_count_per_second_total_cum": 206953.3488061228, "throughput/token_count_per_second_update_recent": 393659.76314917125, "throughput/token_count_per_second_update_cum": 392485.07100329356, "throughput/batch_count_per_second_total_recent": 0.09640553025555944, "throughput/batch_count_per_second_total_cum": 0.0986830467253317, "throughput/batch_count_per_second_update_recent": 0.1877116027589661, "throughput/batch_count_per_second_update_cum": 0.18715146589436224, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 503316480, "throughput/token_count": 503316480, "throughput/batch_count": 240, "throughput/flop_count": 0, "throughput/total_time": 2384.0519924180117, "throughput/update_time": 1282.2238651026273, "throughput/token_count_per_second_total_recent": 223897.5079516489, "throughput/token_count_per_second_total_cum": 211118.08031062025, "throughput/token_count_per_second_update_recent": 393664.08302371926, "throughput/token_count_per_second_update_cum": 392534.0135201081, "throughput/batch_count_per_second_total_recent": 0.10676265142042585, "throughput/batch_count_per_second_total_cum": 0.10066894546061528, "throughput/batch_count_per_second_update_recent": 0.18771366263566935, "throughput/batch_count_per_second_update_cum": 0.1871748035049954, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 524288000, "throughput/token_count": 524288000, "throughput/batch_count": 250, "throughput/flop_count": 0, "throughput/total_time": 2537.098761650035, "throughput/update_time": 1335.4923671315191, "throughput/token_count_per_second_total_recent": 202168.58277547942, "throughput/token_count_per_second_total_cum": 206648.63659427373, "throughput/token_count_per_second_update_recent": 393667.7481505122, "throughput/token_count_per_second_update_cum": 392580.3043907387, "throughput/batch_count_per_second_total_recent": 0.09640149248861285, "throughput/batch_count_per_second_total_cum": 0.09853774862016379, "throughput/batch_count_per_second_update_recent": 0.18771541030431377, "throughput/batch_count_per_second_update_cum": 0.18719687671219765, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 545259520, "throughput/token_count": 545259520, "throughput/batch_count": 260, "throughput/flop_count": 0, "throughput/total_time": 2590.4739345350536, "throughput/update_time": 1388.759745098534, "throughput/token_count_per_second_total_recent": 223871.88912167045, "throughput/token_count_per_second_total_cum": 210486.39506881003, "throughput/token_count_per_second_update_recent": 393668.41239914304, "throughput/token_count_per_second_update_cum": 392623.36190577963, "throughput/batch_count_per_second_total_recent": 0.10675043541034243, "throughput/batch_count_per_second_total_cum": 0.10036773446503164, "throughput/batch_count_per_second_update_recent": 0.18771572704274322, "throughput/batch_count_per_second_update_cum": 0.18721740813530904, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 566231040, "throughput/token_count": 566231040, "throughput/batch_count": 270, "throughput/flop_count": 0, "throughput/total_time": 2743.4162192750373, "throughput/update_time": 1442.0279306704178, "throughput/token_count_per_second_total_recent": 202168.50574709332, "throughput/token_count_per_second_total_cum": 206396.3302475589, "throughput/token_count_per_second_update_recent": 393678.5636181484, "throughput/token_count_per_second_update_cum": 392663.0184872714, "throughput/batch_count_per_second_total_recent": 0.09640145575861613, "throughput/batch_count_per_second_total_cum": 0.09841743957879967, "throughput/batch_count_per_second_update_recent": 0.18772056752116603, "throughput/batch_count_per_second_update_cum": 0.18723631786693162, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 587202560, "throughput/token_count": 587202560, "throughput/batch_count": 280, "throughput/flop_count": 0, "throughput/total_time": 2796.791148387012, "throughput/update_time": 1495.3029013883206, "throughput/token_count_per_second_total_recent": 223861.5201390833, "throughput/token_count_per_second_total_cum": 209955.8132321236, "throughput/token_count_per_second_update_recent": 393677.4699385466, "throughput/token_count_per_second_update_cum": 392698.06769906566, "throughput/batch_count_per_second_total_recent": 0.10674549109415211, "throughput/batch_count_per_second_total_cum": 0.10011473332983188, "throughput/batch_count_per_second_update_recent": 0.18772004601409273, "throughput/batch_count_per_second_update_cum": 0.18725303063348087, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 608174080, "throughput/token_count": 608174080, "throughput/batch_count": 290, "throughput/flop_count": 0, "throughput/total_time": 2949.6854955510353, "throughput/update_time": 1548.5792963503627, "throughput/token_count_per_second_total_recent": 202167.57423837474, "throughput/token_count_per_second_total_cum": 206182.68656685585, "throughput/token_count_per_second_update_recent": 393675.4593654439, "throughput/token_count_per_second_update_cum": 392730.34415048896, "throughput/batch_count_per_second_total_recent": 0.09640101158064592, "throughput/batch_count_per_second_total_cum": 0.09831556633322518, "throughput/batch_count_per_second_update_recent": 0.18771908729812808, "throughput/batch_count_per_second_update_cum": 0.18726842124485443, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 629145600, "throughput/token_count": 629145600, "throughput/batch_count": 300, "throughput/flop_count": 0, "throughput/total_time": 3003.058338998002, "throughput/update_time": 1601.8528411513544, "throughput/token_count_per_second_total_recent": 224004.908235859, "throughput/token_count_per_second_total_cum": 209501.62433738142, "throughput/token_count_per_second_update_recent": 393670.0014787171, "throughput/token_count_per_second_update_cum": 392761.1724606317, "throughput/batch_count_per_second_total_recent": 0.1068138638667388, "throughput/batch_count_per_second_total_cum": 0.09989815918797561, "throughput/batch_count_per_second_update_recent": 0.18771648477493147, "throughput/batch_count_per_second_update_cum": 0.1872831213286551, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 650117120, "throughput/token_count": 650117120, "throughput/batch_count": 310, "throughput/flop_count": 0, "throughput/total_time": 3156.3314061540295, "throughput/update_time": 1655.1285377033055, "throughput/token_count_per_second_total_recent": 202209.7272658882, "throughput/token_count_per_second_total_cum": 205972.3889362314, "throughput/token_count_per_second_update_recent": 393672.0946031518, "throughput/token_count_per_second_update_cum": 392789.5055825196, "throughput/batch_count_per_second_total_recent": 0.0964211117104951, "throughput/batch_count_per_second_total_cum": 0.09821528860866137, "throughput/batch_count_per_second_update_recent": 0.18771748285443868, "throughput/batch_count_per_second_update_cum": 0.18729663161397914, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 671088640, "throughput/token_count": 671088640, "throughput/batch_count": 320, "throughput/flop_count": 0, "throughput/total_time": 3209.7035325120087, "throughput/update_time": 1708.4027871834696, "throughput/token_count_per_second_total_recent": 223957.2505818605, "throughput/token_count_per_second_total_cum": 209081.19183044494, "throughput/token_count_per_second_update_recent": 393669.2192296048, "throughput/token_count_per_second_update_cum": 392816.40432487195, "throughput/batch_count_per_second_total_recent": 0.10679113892643953, "throughput/batch_count_per_second_total_cum": 0.0996976813461518, "throughput/batch_count_per_second_update_recent": 0.18771611176948777, "throughput/batch_count_per_second_update_cum": 0.18730945793384168, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 692060160, "throughput/token_count": 692060160, "throughput/batch_count": 330, "throughput/flop_count": 0, "throughput/total_time": 3362.512193232018, "throughput/update_time": 1761.6806279715383, "throughput/token_count_per_second_total_recent": 202263.39734727255, "throughput/token_count_per_second_total_cum": 205816.40161572103, "throughput/token_count_per_second_update_recent": 393662.89193570527, "throughput/token_count_per_second_update_cum": 392840.8753616498, "throughput/batch_count_per_second_total_recent": 0.09644670359958293, "throughput/batch_count_per_second_total_cum": 0.09814090805803348, "throughput/batch_count_per_second_update_recent": 0.1877130946806456, "throughput/batch_count_per_second_update_cum": 0.1873211266334771, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 713031680, "throughput/token_count": 713031680, "throughput/batch_count": 340, "throughput/flop_count": 0, "throughput/total_time": 3415.8845459170407, "throughput/update_time": 1814.9587236176012, "throughput/token_count_per_second_total_recent": 224011.14531810244, "throughput/token_count_per_second_total_cum": 208739.98240141835, "throughput/token_count_per_second_update_recent": 393657.5177105718, "throughput/token_count_per_second_update_cum": 392863.85454473324, "throughput/batch_count_per_second_total_recent": 0.10681683793931124, "throughput/batch_count_per_second_total_cum": 0.09953498001166265, "throughput/batch_count_per_second_update_recent": 0.18771053205040542, "throughput/batch_count_per_second_update_cum": 0.18733208396183645, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 734003200, "throughput/token_count": 734003200, "throughput/batch_count": 350, "throughput/flop_count": 0, "throughput/total_time": 3570.6092982320115, "throughput/update_time": 1868.2295263125561, "throughput/token_count_per_second_total_recent": 201930.6820535624, "throughput/token_count_per_second_total_cum": 205568.05259075586, "throughput/token_count_per_second_update_recent": 393656.65359245194, "throughput/token_count_per_second_update_cum": 392887.0567894026, "throughput/batch_count_per_second_total_recent": 0.09628805258443947, "throughput/batch_count_per_second_total_cum": 0.09802248601472657, "throughput/batch_count_per_second_update_recent": 0.18771012000677678, "throughput/batch_count_per_second_update_cum": 0.18734314765424853, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 754974720, "throughput/token_count": 754974720, "throughput/batch_count": 360, "throughput/flop_count": 0, "throughput/total_time": 3623.980893074011, "throughput/update_time": 1921.4990626386134, "throughput/token_count_per_second_total_recent": 223584.63622403058, "throughput/token_count_per_second_total_cum": 208327.45598710902, "throughput/token_count_per_second_update_recent": 393655.6669344864, "throughput/token_count_per_second_update_cum": 392909.2314847474, "throughput/batch_count_per_second_total_recent": 0.10661346255494622, "throughput/batch_count_per_second_total_cum": 0.09933827208857966, "throughput/batch_count_per_second_update_recent": 0.18770964953159638, "throughput/batch_count_per_second_update_cum": 0.18735372137296075, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 775946240, "throughput/token_count": 775946240, "throughput/batch_count": 370, "throughput/flop_count": 0, "throughput/total_time": 3779.0529326410033, "throughput/update_time": 1974.7656041345908, "throughput/token_count_per_second_total_recent": 201513.24681602887, "throughput/token_count_per_second_total_cum": 205328.22742382903, "throughput/token_count_per_second_update_recent": 393655.66520503844, "throughput/token_count_per_second_update_cum": 392930.8057500049, "throughput/batch_count_per_second_total_recent": 0.09608900395204013, "throughput/batch_count_per_second_total_cum": 0.09790812846366359, "throughput/batch_count_per_second_update_recent": 0.18770964870693133, "throughput/batch_count_per_second_update_cum": 0.18736400878429646, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 796917760, "throughput/token_count": 796917760, "throughput/batch_count": 380, "throughput/flop_count": 0, "throughput/total_time": 3832.4348147350247, "throughput/update_time": 2028.0436954226461, "throughput/token_count_per_second_total_recent": 223056.69048131295, "throughput/token_count_per_second_total_cum": 207940.32997925865, "throughput/token_count_per_second_update_recent": 393653.70848185505, "throughput/token_count_per_second_update_cum": 392949.0088397339, "throughput/batch_count_per_second_total_recent": 0.10636171840730331, "throughput/batch_count_per_second_total_cum": 0.09915367602312977, "throughput/batch_count_per_second_update_recent": 0.18770871566860917, "throughput/batch_count_per_second_update_cum": 0.18737268869387336, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 817889280, "throughput/token_count": 817889280, "throughput/batch_count": 390, "throughput/flop_count": 0, "throughput/total_time": 3986.6730023160344, "throughput/update_time": 2081.339985151775, "throughput/token_count_per_second_total_recent": 201250.4003762484, "throughput/token_count_per_second_total_cum": 205155.84787737846, "throughput/token_count_per_second_update_recent": 393638.87323246064, "throughput/token_count_per_second_update_cum": 392962.8440498912, "throughput/batch_count_per_second_total_recent": 0.09596366900265141, "throughput/batch_count_per_second_total_cum": 0.09782593149060176, "throughput/batch_count_per_second_update_recent": 0.1877016416704467, "throughput/batch_count_per_second_update_cum": 0.18737928583616792, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 838860800, "throughput/token_count": 838860800, "throughput/batch_count": 400, "throughput/flop_count": 0, "throughput/total_time": 4040.0474286440294, "throughput/update_time": 2134.611642122676, "throughput/token_count_per_second_total_recent": 222827.51801880187, "throughput/token_count_per_second_total_cum": 207636.37427928633, "throughput/token_count_per_second_update_recent": 393640.311681268, "throughput/token_count_per_second_update_cum": 392980.52322333894, "throughput/batch_count_per_second_total_recent": 0.10625244046154111, "throughput/batch_count_per_second_total_cum": 0.09900873865093533, "throughput/batch_count_per_second_update_recent": 0.1877023275762882, "throughput/batch_count_per_second_update_cum": 0.1873877159229941, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 859832320, "throughput/token_count": 859832320, "throughput/batch_count": 410, "throughput/flop_count": 0, "throughput/total_time": 4193.527929550037, "throughput/update_time": 2187.884143058851, "throughput/token_count_per_second_total_recent": 201209.9066597902, "throughput/token_count_per_second_total_cum": 205037.93809053264, "throughput/token_count_per_second_update_recent": 393643.553877545, "throughput/token_count_per_second_update_cum": 392997.1898776506, "throughput/batch_count_per_second_total_recent": 0.0959443600939704, "throughput/batch_count_per_second_total_cum": 0.09776970772291786, "throughput/batch_count_per_second_update_recent": 0.18770387357594728, "throughput/batch_count_per_second_update_cum": 0.18739566320307283, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 880803840, "throughput/token_count": 880803840, "throughput/batch_count": 420, "throughput/flop_count": 0, "throughput/total_time": 4246.9178847110015, "throughput/update_time": 2241.15703301772, "throughput/token_count_per_second_total_recent": 222662.08136209505, "throughput/token_count_per_second_total_cum": 207398.36839579907, "throughput/token_count_per_second_update_recent": 393644.5116568508, "throughput/token_count_per_second_update_cum": 393012.9959764563, "throughput/batch_count_per_second_total_recent": 0.10617355411629441, "throughput/batch_count_per_second_total_cum": 0.09889524860181764, "throughput/batch_count_per_second_update_recent": 0.18770433028070965, "throughput/batch_count_per_second_update_cum": 0.18740320013830963, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 901775360, "throughput/token_count": 901775360, "throughput/batch_count": 430, "throughput/flop_count": 0, "throughput/total_time": 4400.437362540048, "throughput/update_time": 2294.435530113755, "throughput/token_count_per_second_total_recent": 201068.55046274385, "throughput/token_count_per_second_total_cum": 204928.57543584524, "throughput/token_count_per_second_update_recent": 393647.5453199426, "throughput/token_count_per_second_update_cum": 393027.1076107731, "throughput/batch_count_per_second_total_recent": 0.09587695620667641, "throughput/batch_count_per_second_total_cum": 0.09771755954544317, "throughput/batch_count_per_second_update_recent": 0.18770577684399728, "throughput/batch_count_per_second_update_cum": 0.18740992908991486, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 922746880, "throughput/token_count": 922746880, "throughput/batch_count": 440, "throughput/flop_count": 0, "throughput/total_time": 4453.811728182016, "throughput/update_time": 2347.708159423666, "throughput/token_count_per_second_total_recent": 222950.6718642072, "throughput/token_count_per_second_total_cum": 207181.38446697488, "throughput/token_count_per_second_update_recent": 393647.414881969, "throughput/token_count_per_second_update_cum": 393041.56110550096, "throughput/batch_count_per_second_total_recent": 0.10631116479120598, "throughput/batch_count_per_second_total_cum": 0.09879178260182137, "throughput/batch_count_per_second_update_recent": 0.18770571464632463, "throughput/batch_count_per_second_update_cum": 0.18741682105326699, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 943718400, "throughput/token_count": 943718400, "throughput/batch_count": 450, "throughput/flop_count": 0, "throughput/total_time": 4608.291631601052, "throughput/update_time": 2400.977511668694, "throughput/token_count_per_second_total_recent": 201114.71827869478, "throughput/token_count_per_second_total_cum": 204787.03941575965, "throughput/token_count_per_second_update_recent": 393648.3702242922, "throughput/token_count_per_second_update_cum": 393055.90969242767, "throughput/batch_count_per_second_total_recent": 0.0958989707368349, "throughput/batch_count_per_second_total_cum": 0.09765006991184218, "throughput/batch_count_per_second_update_recent": 0.18770617018904315, "throughput/batch_count_per_second_update_cum": 0.18742366299268134, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 964689920, "throughput/token_count": 964689920, "throughput/batch_count": 460, "throughput/flop_count": 0, "throughput/total_time": 4661.668510478048, "throughput/update_time": 2454.2510100168292, "throughput/token_count_per_second_total_recent": 223089.74738159907, "throughput/token_count_per_second_total_cum": 206940.90921129702, "throughput/token_count_per_second_update_recent": 393646.0275753389, "throughput/token_count_per_second_update_cum": 393068.9713736269, "throughput/batch_count_per_second_total_recent": 0.10637748116569475, "throughput/batch_count_per_second_total_cum": 0.09867711506428577, "throughput/batch_count_per_second_update_recent": 0.18770505312697358, "throughput/batch_count_per_second_update_cum": 0.18742989128762574, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 985661440, "throughput/token_count": 985661440, "throughput/batch_count": 470, "throughput/flop_count": 0, "throughput/total_time": 4816.396098136029, "throughput/update_time": 2507.5231073708273, "throughput/token_count_per_second_total_recent": 201181.0106754253, "throughput/token_count_per_second_total_cum": 204647.08880182347, "throughput/token_count_per_second_update_recent": 393641.10297754814, "throughput/token_count_per_second_update_cum": 393081.69767315907, "throughput/batch_count_per_second_total_recent": 0.09593058141490235, "throughput/batch_count_per_second_total_cum": 0.09758333625880407, "throughput/batch_count_per_second_update_recent": 0.18770270489575774, "throughput/batch_count_per_second_update_cum": 0.1874359596601291, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1006632960, "throughput/token_count": 1006632960, "throughput/batch_count": 480, "throughput/flop_count": 0, "throughput/total_time": 4869.772359199007, "throughput/update_time": 2560.802637038869, "throughput/token_count_per_second_total_recent": 222967.65321278523, "throughput/token_count_per_second_total_cum": 206710.4755109279, "throughput/token_count_per_second_update_recent": 393641.7374192035, "throughput/token_count_per_second_update_cum": 393092.7535922875, "throughput/batch_count_per_second_total_recent": 0.10631926212920438, "throughput/batch_count_per_second_total_cum": 0.09856723571344753, "throughput/batch_count_per_second_update_recent": 0.18770300742111373, "throughput/batch_count_per_second_update_cum": 0.18744123153318762, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1027604480, "throughput/token_count": 1027604480, "throughput/batch_count": 490, "throughput/flop_count": 0, "throughput/total_time": 5024.381550224032, "throughput/update_time": 2614.0767399497563, "throughput/token_count_per_second_total_recent": 201109.12142032455, "throughput/token_count_per_second_total_cum": 204523.57563373746, "throughput/token_count_per_second_update_recent": 393658.25357337896, "throughput/token_count_per_second_update_cum": 393104.17490641493, "throughput/batch_count_per_second_total_recent": 0.09589630194679477, "throughput/batch_count_per_second_total_cum": 0.09752444059073327, "throughput/batch_count_per_second_update_recent": 0.1877108829371352, "throughput/batch_count_per_second_update_cum": 0.1874466776401591, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1048576000, "throughput/token_count": 1048576000, "throughput/batch_count": 500, "throughput/flop_count": 0, "throughput/total_time": 5077.749329403043, "throughput/update_time": 2667.341171991662, "throughput/token_count_per_second_total_recent": 222703.65989854388, "throughput/token_count_per_second_total_cum": 206504.08911053397, "throughput/token_count_per_second_update_recent": 393663.3310787424, "throughput/token_count_per_second_update_cum": 393116.565293762, "throughput/batch_count_per_second_total_recent": 0.10619338030745691, "throughput/batch_count_per_second_total_cum": 0.09846882300879191, "throughput/batch_count_per_second_update_recent": 0.18771330408036346, "throughput/batch_count_per_second_update_cum": 0.18745258583725072, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1069547520, "throughput/token_count": 1069547520, "throughput/batch_count": 510, "throughput/flop_count": 0, "throughput/total_time": 5232.712933773, "throughput/update_time": 2720.6185927585466, "throughput/token_count_per_second_total_recent": 200822.91948795228, "throughput/token_count_per_second_total_cum": 204396.36829624674, "throughput/token_count_per_second_update_recent": 393659.56233292527, "throughput/token_count_per_second_update_cum": 393126.5936529317, "throughput/batch_count_per_second_total_recent": 0.09575983023069014, "throughput/batch_count_per_second_total_cum": 0.09746378340542161, "throughput/batch_count_per_second_update_recent": 0.18771150700231803, "throughput/batch_count_per_second_update_cum": 0.18745736773153862, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1090519040, "throughput/token_count": 1090519040, "throughput/batch_count": 520, "throughput/flop_count": 0, "throughput/total_time": 5286.093386778026, "throughput/update_time": 2773.886181908485, "throughput/token_count_per_second_total_recent": 222358.85124450497, "throughput/token_count_per_second_total_cum": 206299.61678839958, "throughput/token_count_per_second_update_recent": 393662.28913976834, "throughput/token_count_per_second_update_cum": 393137.63019998994, "throughput/batch_count_per_second_total_recent": 0.10602896272874116, "throughput/batch_count_per_second_total_cum": 0.09837132300777415, "throughput/batch_count_per_second_update_recent": 0.18771280724514405, "throughput/batch_count_per_second_update_cum": 0.18746263036727426, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1111490560, "throughput/token_count": 1111490560, "throughput/batch_count": 530, "throughput/flop_count": 0, "throughput/total_time": 5441.220508124039, "throughput/update_time": 2827.1616227625636, "throughput/token_count_per_second_total_recent": 200512.9531126153, "throughput/token_count_per_second_total_cum": 204272.28750249764, "throughput/token_count_per_second_update_recent": 393665.39147001173, "throughput/token_count_per_second_update_cum": 393147.1589918888, "throughput/batch_count_per_second_total_recent": 0.09561202674513593, "throughput/batch_count_per_second_total_cum": 0.0974046170723427, "throughput/batch_count_per_second_update_recent": 0.18771428655148112, "throughput/batch_count_per_second_update_cum": 0.18746717404932442, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1132462080, "throughput/token_count": 1132462080, "throughput/batch_count": 540, "throughput/flop_count": 0, "throughput/total_time": 5494.618768353015, "throughput/update_time": 2880.4332658784697, "throughput/token_count_per_second_total_recent": 222204.04698418526, "throughput/token_count_per_second_total_cum": 206103.8495559629, "throughput/token_count_per_second_update_recent": 393665.73019525973, "throughput/token_count_per_second_update_cum": 393156.8536633407, "throughput/batch_count_per_second_total_recent": 0.10595514630517257, "throughput/batch_count_per_second_total_cum": 0.09827797391698975, "throughput/batch_count_per_second_update_recent": 0.1877144480682658, "throughput/batch_count_per_second_update_cum": 0.18747179682890924, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1153433600, "throughput/token_count": 1153433600, "throughput/batch_count": 550, "throughput/flop_count": 0, "throughput/total_time": 5649.622147237009, "throughput/update_time": 2933.705731303431, "throughput/token_count_per_second_total_recent": 200408.06932449192, "throughput/token_count_per_second_total_cum": 204161.19342850134, "throughput/token_count_per_second_update_recent": 393664.0302826699, "throughput/token_count_per_second_update_cum": 393166.08605033305, "throughput/batch_count_per_second_total_recent": 0.0955620142576656, "throughput/batch_count_per_second_total_cum": 0.0973516432898051, "throughput/batch_count_per_second_update_recent": 0.18771363748677725, "throughput/batch_count_per_second_update_cum": 0.18747619917408612, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1174405120, "throughput/token_count": 1174405120, "throughput/batch_count": 560, "throughput/flop_count": 0, "throughput/total_time": 5703.016606429999, "throughput/update_time": 2986.9915024373913, "throughput/token_count_per_second_total_recent": 222130.6599207603, "throughput/token_count_per_second_total_cum": 205927.00338201533, "throughput/token_count_per_second_update_recent": 393654.8257324209, "throughput/token_count_per_second_update_cum": 393173.2377014407, "throughput/batch_count_per_second_total_recent": 0.10592015262640014, "throughput/batch_count_per_second_total_cum": 0.0981936470899655, "throughput/batch_count_per_second_update_recent": 0.18770924841519399, "throughput/batch_count_per_second_update_cum": 0.18747960934707675, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1195376640, "throughput/token_count": 1195376640, "throughput/batch_count": 570, "throughput/flop_count": 0, "throughput/total_time": 5856.936125319044, "throughput/update_time": 3040.2605143213877, "throughput/token_count_per_second_total_recent": 200559.62216874372, "throughput/token_count_per_second_total_cum": 204095.89833709932, "throughput/token_count_per_second_update_recent": 393657.7205602768, "throughput/token_count_per_second_update_cum": 393182.3060455128, "throughput/batch_count_per_second_total_recent": 0.09563428028523623, "throughput/batch_count_per_second_total_cum": 0.09732050816397635, "throughput/batch_count_per_second_update_recent": 0.1877106287766823, "throughput/batch_count_per_second_update_cum": 0.1874839334704937, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1216348160, "throughput/token_count": 1216348160, "throughput/batch_count": 580, "throughput/flop_count": 0, "throughput/total_time": 5910.318671693036, "throughput/update_time": 3093.5336661074543, "throughput/token_count_per_second_total_recent": 222297.30151363992, "throughput/token_count_per_second_total_cum": 205800.7744701813, "throughput/token_count_per_second_update_recent": 393662.4798022441, "throughput/token_count_per_second_update_cum": 393190.535899521, "throughput/batch_count_per_second_total_recent": 0.1059996135299873, "throughput/batch_count_per_second_total_cum": 0.09813345645436349, "throughput/batch_count_per_second_update_recent": 0.18771289816009717, "throughput/batch_count_per_second_update_cum": 0.1874878577706914, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1237319680, "throughput/token_count": 1237319680, "throughput/batch_count": 590, "throughput/flop_count": 0, "throughput/total_time": 6063.751631618012, "throughput/update_time": 3146.8101018704474, "throughput/token_count_per_second_total_recent": 200785.91235413591, "throughput/token_count_per_second_total_cum": 204051.84037358762, "throughput/token_count_per_second_update_recent": 393659.3586471032, "throughput/token_count_per_second_update_cum": 393198.0767649575, "throughput/batch_count_per_second_total_recent": 0.0957421838541679, "throughput/batch_count_per_second_total_cum": 0.09729949968985921, "throughput/batch_count_per_second_update_recent": 0.18771140987734947, "throughput/batch_count_per_second_update_cum": 0.18749145353553653, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1258291200, "throughput/token_count": 1258291200, "throughput/batch_count": 600, "throughput/flop_count": 0, "throughput/total_time": 6117.130997166038, "throughput/update_time": 3200.0810677845147, "throughput/token_count_per_second_total_recent": 222658.19034045527, "throughput/token_count_per_second_total_cum": 205699.56742514504, "throughput/token_count_per_second_update_recent": 393655.1334714269, "throughput/token_count_per_second_update_cum": 393206.0386429967, "throughput/batch_count_per_second_total_recent": 0.10617169873259319, "throughput/batch_count_per_second_total_cum": 0.09808519717461826, "throughput/batch_count_per_second_update_recent": 0.18770939515658708, "throughput/batch_count_per_second_update_cum": 0.1874952500548347, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1279262720, "throughput/token_count": 1279262720, "throughput/batch_count": 610, "throughput/flop_count": 0, "throughput/total_time": 6270.89233304502, "throughput/update_time": 3253.376109398436, "throughput/token_count_per_second_total_recent": 201017.63002890654, "throughput/token_count_per_second_total_cum": 204000.1090847649, "throughput/token_count_per_second_update_recent": 393641.84937802644, "throughput/token_count_per_second_update_cum": 393210.829914326, "throughput/batch_count_per_second_total_recent": 0.09585267545171096, "throughput/batch_count_per_second_total_cum": 0.0972748322891068, "throughput/batch_count_per_second_update_recent": 0.1877030608072407, "throughput/batch_count_per_second_update_cum": 0.18749753471103955, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1300234240, "throughput/token_count": 1300234240, "throughput/batch_count": 620, "throughput/flop_count": 0, "throughput/total_time": 6324.271360302053, "throughput/update_time": 3306.647373120475, "throughput/token_count_per_second_total_recent": 222982.4798603182, "throughput/token_count_per_second_total_cum": 205594.31528534216, "throughput/token_count_per_second_update_recent": 393640.1946009381, "throughput/token_count_per_second_update_cum": 393218.29432721523, "throughput/batch_count_per_second_total_recent": 0.10632633202567969, "throughput/batch_count_per_second_total_cum": 0.09803500904337986, "throughput/batch_count_per_second_update_recent": 0.18770227174803644, "throughput/batch_count_per_second_update_cum": 0.1875010940204693, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1321205760, "throughput/token_count": 1321205760, "throughput/batch_count": 630, "throughput/flop_count": 0, "throughput/total_time": 6477.804279661039, "throughput/update_time": 3359.92066662648, "throughput/token_count_per_second_total_recent": 201329.61850513256, "throughput/token_count_per_second_total_cum": 203958.88837646914, "throughput/token_count_per_second_update_recent": 393640.5116592438, "throughput/token_count_per_second_update_cum": 393225.2844905869, "throughput/batch_count_per_second_total_recent": 0.0960014431501067, "throughput/batch_count_per_second_total_cum": 0.09725517672370393, "throughput/batch_count_per_second_update_recent": 0.1877024229332179, "throughput/batch_count_per_second_update_cum": 0.18750442719010682, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1342177280, "throughput/token_count": 1342177280, "throughput/batch_count": 640, "throughput/flop_count": 0, "throughput/total_time": 6531.185974933032, "throughput/update_time": 3413.19108713849, "throughput/token_count_per_second_total_recent": 223342.8192815302, "throughput/token_count_per_second_total_cum": 205502.8420797284, "throughput/token_count_per_second_update_recent": 393640.9640297061, "throughput/token_count_per_second_update_cum": 393232.38744457124, "throughput/batch_count_per_second_total_recent": 0.10649815525127897, "throughput/batch_count_per_second_total_cum": 0.09799139121996327, "throughput/batch_count_per_second_update_recent": 0.1877026386402636, "throughput/batch_count_per_second_update_cum": 0.18750781414249956, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1363148800, "throughput/token_count": 1363148800, "throughput/batch_count": 650, "throughput/flop_count": 0, "throughput/total_time": 6684.524030425004, "throughput/update_time": 3466.4617998044705, "throughput/token_count_per_second_total_recent": 201658.39888375954, "throughput/token_count_per_second_total_cum": 203926.0826643076, "throughput/token_count_per_second_update_recent": 393643.17869601777, "throughput/token_count_per_second_update_cum": 393239.23894874303, "throughput/batch_count_per_second_total_recent": 0.09615821785152413, "throughput/batch_count_per_second_total_cum": 0.097239533741144, "throughput/batch_count_per_second_update_recent": 0.18770369467545403, "throughput/batch_count_per_second_update_cum": 0.18751108119427826, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1384120320, "throughput/token_count": 1384120320, "throughput/batch_count": 660, "throughput/flop_count": 0, "throughput/total_time": 6737.915407613036, "throughput/update_time": 3519.7321525084553, "throughput/token_count_per_second_total_recent": 223482.4386046569, "throughput/token_count_per_second_total_cum": 205422.6324118154, "throughput/token_count_per_second_update_recent": 393654.7660917754, "throughput/token_count_per_second_update_cum": 393245.92327673576, "throughput/batch_count_per_second_total_recent": 0.10656473093254895, "throughput/batch_count_per_second_total_cum": 0.09795314426985521, "throughput/batch_count_per_second_update_recent": 0.18770921997631806, "throughput/batch_count_per_second_update_cum": 0.1875142685302428, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1405091840, "throughput/token_count": 1405091840, "throughput/batch_count": 670, "throughput/flop_count": 0, "throughput/total_time": 6891.055202779011, "throughput/update_time": 3573.0133047814597, "throughput/token_count_per_second_total_recent": 201810.9467740556, "throughput/token_count_per_second_total_cum": 203900.82485964667, "throughput/token_count_per_second_update_recent": 393645.2253051928, "throughput/token_count_per_second_update_cum": 393251.21966931527, "throughput/batch_count_per_second_total_recent": 0.09623095835402279, "throughput/batch_count_per_second_total_cum": 0.09722748988134702, "throughput/batch_count_per_second_update_recent": 0.187704670574757, "throughput/batch_count_per_second_update_cum": 0.18751679404702914, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1426063360, "throughput/token_count": 1426063360, "throughput/batch_count": 680, "throughput/flop_count": 0, "throughput/total_time": 6944.4543808570015, "throughput/update_time": 3626.2884129853337, "throughput/token_count_per_second_total_recent": 223547.8840228971, "throughput/token_count_per_second_total_cum": 205352.8300122568, "throughput/token_count_per_second_update_recent": 393644.44082015636, "throughput/token_count_per_second_update_cum": 393257.01587701257, "throughput/batch_count_per_second_total_recent": 0.10659593773980003, "throughput/batch_count_per_second_total_cum": 0.09791985989201393, "throughput/batch_count_per_second_update_recent": 0.18770429650314158, "throughput/batch_count_per_second_update_cum": 0.18751955789423588, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1447034880, "throughput/token_count": 1447034880, "throughput/batch_count": 690, "throughput/flop_count": 0, "throughput/total_time": 7097.769050646049, "throughput/update_time": 3679.5599112784257, "throughput/token_count_per_second_total_recent": 201831.0921510957, "throughput/token_count_per_second_total_cum": 203871.7898081354, "throughput/token_count_per_second_update_recent": 393647.9668640194, "throughput/token_count_per_second_update_cum": 393263.0300609081, "throughput/batch_count_per_second_total_recent": 0.09624056441836153, "throughput/batch_count_per_second_total_cum": 0.0972136448898961, "throughput/batch_count_per_second_update_recent": 0.18770597785187693, "throughput/batch_count_per_second_update_cum": 0.18752242568059355, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1468006400, "throughput/token_count": 1468006400, "throughput/batch_count": 700, "throughput/flop_count": 0, "throughput/total_time": 7151.144483595039, "throughput/update_time": 3732.833151328552, "throughput/token_count_per_second_total_recent": 223651.22897925222, "throughput/token_count_per_second_total_cum": 205282.72129973813, "throughput/token_count_per_second_update_recent": 393645.6286875643, "throughput/token_count_per_second_update_cum": 393268.6890860691, "throughput/batch_count_per_second_total_recent": 0.10664521645510303, "throughput/batch_count_per_second_total_cum": 0.09788642945277125, "throughput/batch_count_per_second_update_recent": 0.1877048629224607, "throughput/batch_count_per_second_update_cum": 0.18752512411406952, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1488977920, "throughput/token_count": 1488977920, "throughput/batch_count": 710, "throughput/flop_count": 0, "throughput/total_time": 7304.954160230001, "throughput/update_time": 3786.1120017025387, "throughput/token_count_per_second_total_recent": 201822.27236521925, "throughput/token_count_per_second_total_cum": 203831.24758076764, "throughput/token_count_per_second_update_recent": 393658.2179795738, "throughput/token_count_per_second_update_cum": 393273.6060978747, "throughput/batch_count_per_second_total_recent": 0.09623635881672823, "throughput/batch_count_per_second_total_cum": 0.09719431284941084, "throughput/batch_count_per_second_update_recent": 0.1877108659646863, "throughput/batch_count_per_second_update_cum": 0.18752746872800574, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1509949440, "throughput/token_count": 1509949440, "throughput/batch_count": 720, "throughput/flop_count": 0, "throughput/total_time": 7358.337214609026, "throughput/update_time": 3839.3859579174896, "throughput/token_count_per_second_total_recent": 223589.76698567366, "throughput/token_count_per_second_total_cum": 205202.5336651045, "throughput/token_count_per_second_update_recent": 393655.8700469998, "throughput/token_count_per_second_update_cum": 393278.8879654619, "throughput/batch_count_per_second_total_recent": 0.10661590909274753, "throughput/batch_count_per_second_total_cum": 0.09784819300894952, "throughput/batch_count_per_second_update_recent": 0.18770974638319005, "throughput/batch_count_per_second_update_cum": 0.18752998731873602, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1530920960, "throughput/token_count": 1530920960, "throughput/batch_count": 730, "throughput/flop_count": 0, "throughput/total_time": 7511.621099121054, "throughput/update_time": 3892.6592194504337, "throughput/token_count_per_second_total_recent": 201870.18884808588, "throughput/token_count_per_second_total_cum": 203806.99982046956, "throughput/token_count_per_second_update_recent": 393655.973963077, "throughput/token_count_per_second_update_cum": 393284.0954457184, "throughput/batch_count_per_second_total_recent": 0.09625920717624945, "throughput/batch_count_per_second_total_cum": 0.09718275061629751, "throughput/batch_count_per_second_update_recent": 0.187709795934237, "throughput/batch_count_per_second_update_cum": 0.18753247043882293, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1551892480, "throughput/token_count": 1551892480, "throughput/batch_count": 740, "throughput/flop_count": 0, "throughput/total_time": 7565.03086849401, "throughput/update_time": 3945.9412919793394, "throughput/token_count_per_second_total_recent": 223595.01166969276, "throughput/token_count_per_second_total_cum": 205140.27067135274, "throughput/token_count_per_second_update_recent": 393645.9529989011, "throughput/token_count_per_second_update_cum": 393288.28412993165, "throughput/batch_count_per_second_total_recent": 0.10661840995297087, "throughput/batch_count_per_second_total_cum": 0.09781850369994771, "throughput/batch_count_per_second_update_recent": 0.1877050175661569, "throughput/batch_count_per_second_update_cum": 0.1875344677590998, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1572864000, "throughput/token_count": 1572864000, "throughput/batch_count": 750, "throughput/flop_count": 0, "throughput/total_time": 7718.234012908011, "throughput/update_time": 3999.225145033328, "throughput/token_count_per_second_total_recent": 201891.99931495328, "throughput/token_count_per_second_total_cum": 203785.47701061342, "throughput/token_count_per_second_update_recent": 393636.95860143664, "throughput/token_count_per_second_update_cum": 393292.1861009384, "throughput/batch_count_per_second_total_recent": 0.09626960721728958, "throughput/batch_count_per_second_total_cum": 0.09717248774080917, "throughput/batch_count_per_second_update_recent": 0.1877007287032302, "throughput/batch_count_per_second_update_cum": 0.18753632836386605, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1593835520, "throughput/token_count": 1593835520, "throughput/batch_count": 760, "throughput/flop_count": 0, "throughput/total_time": 7771.636799781001, "throughput/update_time": 4052.503600837372, "throughput/token_count_per_second_total_recent": 223573.8366515385, "throughput/token_count_per_second_total_cum": 205083.63438251684, "throughput/token_count_per_second_update_recent": 393631.1160966386, "throughput/token_count_per_second_update_cum": 393296.5092666826, "throughput/batch_count_per_second_total_recent": 0.10660831291748929, "throughput/batch_count_per_second_total_cum": 0.09779149741292803, "throughput/batch_count_per_second_update_recent": 0.18769794277984553, "throughput/batch_count_per_second_update_cum": 0.18753838980993395, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1614807040, "throughput/token_count": 1614807040, "throughput/batch_count": 770, "throughput/flop_count": 0, "throughput/total_time": 7924.894128009037, "throughput/update_time": 4105.784444000397, "throughput/token_count_per_second_total_recent": 201867.20823921694, "throughput/token_count_per_second_total_cum": 203763.86282471212, "throughput/token_count_per_second_update_recent": 393632.4430935613, "throughput/token_count_per_second_update_cum": 393300.4915442277, "throughput/batch_count_per_second_total_recent": 0.09625778591118667, "throughput/batch_count_per_second_total_cum": 0.09716218129382712, "throughput/batch_count_per_second_update_recent": 0.18769857554128708, "throughput/batch_count_per_second_update_cum": 0.18754028870784173, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1635778560, "throughput/token_count": 1635778560, "throughput/batch_count": 780, "throughput/flop_count": 0, "throughput/total_time": 7978.290488699044, "throughput/update_time": 4159.058140857378, "throughput/token_count_per_second_total_recent": 223591.73387022267, "throughput/token_count_per_second_total_cum": 205028.7041211423, "throughput/token_count_per_second_update_recent": 393632.4957632885, "throughput/token_count_per_second_update_cum": 393305.0475853143, "throughput/batch_count_per_second_total_recent": 0.10661684697638639, "throughput/batch_count_per_second_total_cum": 0.09776530462319484, "throughput/batch_count_per_second_update_recent": 0.18769860065617014, "throughput/batch_count_per_second_update_cum": 0.18754246119752613, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1656750080, "throughput/token_count": 1656750080, "throughput/batch_count": 790, "throughput/flop_count": 0, "throughput/total_time": 8133.211715042999, "throughput/update_time": 4212.332782215439, "throughput/token_count_per_second_total_recent": 201551.7939408621, "throughput/token_count_per_second_total_cum": 203701.82629522771, "throughput/token_count_per_second_update_recent": 393630.390665461, "throughput/token_count_per_second_update_cum": 393309.40019621316, "throughput/batch_count_per_second_total_recent": 0.0961073846535025, "throughput/batch_count_per_second_total_cum": 0.09713259997140299, "throughput/batch_count_per_second_update_recent": 0.18769759686730433, "throughput/batch_count_per_second_update_cum": 0.1875445366841379, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1677721600, "throughput/token_count": 1677721600, "throughput/batch_count": 800, "throughput/flop_count": 0, "throughput/total_time": 8186.5969784220215, "throughput/update_time": 4265.60545003548, "throughput/token_count_per_second_total_recent": 223322.01440008092, "throughput/token_count_per_second_total_cum": 204935.1646871205, "throughput/token_count_per_second_update_recent": 393630.70924910577, "throughput/token_count_per_second_update_cum": 393313.8260562859, "throughput/batch_count_per_second_total_recent": 0.10648823471073195, "throughput/batch_count_per_second_total_cum": 0.09772070154529595, "throughput/batch_count_per_second_update_recent": 0.18769774877982415, "throughput/batch_count_per_second_update_cum": 0.18754664709867758, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1698693120, "throughput/token_count": 1698693120, "throughput/batch_count": 810, "throughput/flop_count": 0, "throughput/total_time": 8342.015340365004, "throughput/update_time": 4318.878572056361, "throughput/token_count_per_second_total_recent": 201236.804156679, "throughput/token_count_per_second_total_cum": 203631.023282879, "throughput/token_count_per_second_update_recent": 393635.6973202118, "throughput/token_count_per_second_update_cum": 393318.101367966, "throughput/batch_count_per_second_total_recent": 0.09595718581994962, "throughput/batch_count_per_second_total_cum": 0.0970988384642024, "throughput/batch_count_per_second_update_recent": 0.18770012727747526, "throughput/batch_count_per_second_update_cum": 0.18754868572614955, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1719664640, "throughput/token_count": 1719664640, "throughput/batch_count": 820, "throughput/flop_count": 0, "throughput/total_time": 8395.40680388402, "throughput/update_time": 4372.159924876352, "throughput/token_count_per_second_total_recent": 222808.8466094595, "throughput/token_count_per_second_total_cum": 204833.98603203133, "throughput/token_count_per_second_update_recent": 393629.9478801292, "throughput/token_count_per_second_update_cum": 393321.5320454302, "throughput/batch_count_per_second_total_recent": 0.10624353723977065, "throughput/batch_count_per_second_total_cum": 0.09767245580293242, "throughput/batch_count_per_second_update_recent": 0.18769738573080502, "throughput/batch_count_per_second_update_cum": 0.1875503216006423, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1740636160, "throughput/token_count": 1740636160, "throughput/batch_count": 830, "throughput/flop_count": 0, "throughput/total_time": 8550.23874416505, "throughput/update_time": 4425.43352887634, "throughput/token_count_per_second_total_recent": 200933.56261130192, "throughput/token_count_per_second_total_cum": 203577.49205399258, "throughput/token_count_per_second_update_recent": 393629.98171940027, "throughput/token_count_per_second_update_cum": 393325.56881539343, "throughput/batch_count_per_second_total_recent": 0.0958125889832029, "throughput/batch_count_per_second_total_cum": 0.09707331278514508, "throughput/batch_count_per_second_update_recent": 0.18769740186662687, "throughput/batch_count_per_second_update_cum": 0.1875522464825599, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1761607680, "throughput/token_count": 1761607680, "throughput/batch_count": 840, "throughput/flop_count": 0, "throughput/total_time": 8603.637448858004, "throughput/update_time": 4478.715910169412, "throughput/token_count_per_second_total_recent": 222414.7501849441, "throughput/token_count_per_second_total_cum": 204751.50080084155, "throughput/token_count_per_second_update_recent": 393630.12061253045, "throughput/token_count_per_second_update_cum": 393328.7387128257, "throughput/batch_count_per_second_total_recent": 0.10605561742064672, "throughput/batch_count_per_second_total_cum": 0.09763312377969816, "throughput/batch_count_per_second_update_recent": 0.18769746809603235, "throughput/batch_count_per_second_update_cum": 0.18755375800744328, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1782579200, "throughput/token_count": 1782579200, "throughput/batch_count": 850, "throughput/flop_count": 0, "throughput/total_time": 8756.833522661007, "throughput/update_time": 4531.9930339534185, "throughput/token_count_per_second_total_recent": 200938.68442783688, "throughput/token_count_per_second_total_cum": 203564.35866766528, "throughput/token_count_per_second_update_recent": 393636.4317302535, "throughput/token_count_per_second_update_cum": 393332.290373137, "throughput/batch_count_per_second_total_recent": 0.09581503125564426, "throughput/batch_count_per_second_total_cum": 0.09706705029853119, "throughput/batch_count_per_second_update_recent": 0.1877004774714725, "throughput/batch_count_per_second_update_cum": 0.18755545157105302, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1803550720, "throughput/token_count": 1803550720, "throughput/batch_count": 860, "throughput/flop_count": 0, "throughput/total_time": 8810.230298971, "throughput/update_time": 4585.266904477379, "throughput/token_count_per_second_total_recent": 222436.65809879926, "throughput/token_count_per_second_total_cum": 204710.96200636748, "throughput/token_count_per_second_update_recent": 393637.43158690125, "throughput/token_count_per_second_update_cum": 393336.0385714701, "throughput/batch_count_per_second_total_recent": 0.10606606392803157, "throughput/batch_count_per_second_total_cum": 0.09761379337614416, "throughput/batch_count_per_second_update_recent": 0.18770095424027503, "throughput/batch_count_per_second_update_cum": 0.18755723885129458, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1824522240, "throughput/token_count": 1824522240, "throughput/batch_count": 870, "throughput/flop_count": 0, "throughput/total_time": 8963.212115761009, "throughput/update_time": 4638.541050948435, "throughput/token_count_per_second_total_recent": 200990.67788272206, "throughput/token_count_per_second_total_cum": 203556.7401993913, "throughput/token_count_per_second_update_recent": 393642.30503992847, "throughput/token_count_per_second_update_cum": 393339.6772735131, "throughput/batch_count_per_second_total_recent": 0.09583982366691687, "throughput/batch_count_per_second_total_cum": 0.09706341752976956, "throughput/batch_count_per_second_update_recent": 0.18770327808376716, "throughput/batch_count_per_second_update_cum": 0.1875589739196363, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1845493760, "throughput/token_count": 1845493760, "throughput/batch_count": 880, "throughput/flop_count": 0, "throughput/total_time": 9016.607044072007, "throughput/update_time": 4691.8174923404, "throughput/token_count_per_second_total_recent": 222896.70452893883, "throughput/token_count_per_second_total_cum": 204677.18632734747, "throughput/token_count_per_second_update_recent": 393639.5824019647, "throughput/token_count_per_second_update_cum": 393343.04094582755, "throughput/batch_count_per_second_total_recent": 0.10628543116042082, "throughput/batch_count_per_second_total_cum": 0.09759768787734388, "throughput/batch_count_per_second_update_recent": 0.1877019798288177, "throughput/batch_count_per_second_update_cum": 0.18756057784358385, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1866465280, "throughput/token_count": 1866465280, "throughput/batch_count": 890, "throughput/flop_count": 0, "throughput/total_time": 9169.565102360037, "throughput/update_time": 4745.092234898475, "throughput/token_count_per_second_total_recent": 201373.46263191948, "throughput/token_count_per_second_total_cum": 203550.0330893135, "throughput/token_count_per_second_update_recent": 393640.05967854365, "throughput/token_count_per_second_update_cum": 393346.46991112374, "throughput/batch_count_per_second_total_recent": 0.09602234965892767, "throughput/batch_count_per_second_total_cum": 0.09706021933046032, "throughput/batch_count_per_second_update_recent": 0.18770220741202528, "throughput/batch_count_per_second_update_cum": 0.18756221290165126, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1887436800, "throughput/token_count": 1887436800, "throughput/batch_count": 900, "throughput/flop_count": 0, "throughput/total_time": 9222.955616571999, "throughput/update_time": 4798.3678903255495, "throughput/token_count_per_second_total_recent": 223490.65257616257, "throughput/token_count_per_second_total_cum": 204645.54731333785, "throughput/token_count_per_second_update_recent": 393637.23850083165, "throughput/token_count_per_second_update_cum": 393349.7479018736, "throughput/batch_count_per_second_total_recent": 0.10656864765937928, "throughput/batch_count_per_second_total_cum": 0.09758260121981518, "throughput/batch_count_per_second_update_recent": 0.1877008621696623, "throughput/batch_count_per_second_update_cum": 0.18756377596944504, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1908408320, "throughput/token_count": 1908408320, "throughput/batch_count": 910, "throughput/flop_count": 0, "throughput/total_time": 9376.841395194002, "throughput/update_time": 4851.6424762834795, "throughput/token_count_per_second_total_recent": 201672.90935025286, "throughput/token_count_per_second_total_cum": 203523.5789503845, "throughput/token_count_per_second_update_recent": 393638.37501931813, "throughput/token_count_per_second_update_cum": 393353.04061026045, "throughput/batch_count_per_second_total_recent": 0.09616513698113101, "throughput/batch_count_per_second_total_cum": 0.09704760501403069, "throughput/batch_count_per_second_update_recent": 0.18770140410390765, "throughput/batch_count_per_second_update_cum": 0.187565346055155, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1929379840, "throughput/token_count": 1929379840, "throughput/batch_count": 920, "throughput/flop_count": 0, "throughput/total_time": 9430.23481100105, "throughput/update_time": 4904.91280556639, "throughput/token_count_per_second_total_recent": 223717.1512105476, "throughput/token_count_per_second_total_cum": 204595.1006171383, "throughput/token_count_per_second_update_recent": 393645.021782827, "throughput/token_count_per_second_update_cum": 393356.6031613088, "throughput/batch_count_per_second_total_recent": 0.10667665062453632, "throughput/batch_count_per_second_total_cum": 0.09755854636055865, "throughput/batch_count_per_second_update_recent": 0.18770457352773046, "throughput/batch_count_per_second_update_cum": 0.18756704481187286, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1950351360, "throughput/token_count": 1950351360, "throughput/batch_count": 930, "throughput/flop_count": 0, "throughput/total_time": 9583.694742296997, "throughput/update_time": 4958.171936342376, "throughput/token_count_per_second_total_recent": 201942.07435343356, "throughput/token_count_per_second_total_cum": 203507.24980755642, "throughput/token_count_per_second_update_recent": 393656.8016314699, "throughput/token_count_per_second_update_cum": 393360.97760231496, "throughput/batch_count_per_second_total_recent": 0.09629348485633543, "throughput/batch_count_per_second_total_cum": 0.09703981867196866, "throughput/batch_count_per_second_update_recent": 0.18771019059728142, "throughput/batch_count_per_second_update_cum": 0.18756913070789097, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1971322880, "throughput/token_count": 1971322880, "throughput/batch_count": 940, "throughput/flop_count": 0, "throughput/total_time": 9637.072636537021, "throughput/update_time": 5011.441395019239, "throughput/token_count_per_second_total_recent": 223658.9258136621, "throughput/token_count_per_second_total_cum": 204556.1919421595, "throughput/token_count_per_second_update_recent": 393665.29887606046, "throughput/token_count_per_second_update_cum": 393364.4483918847, "throughput/batch_count_per_second_total_recent": 0.10664888659175019, "throughput/batch_count_per_second_total_cum": 0.09753999325855231, "throughput/batch_count_per_second_update_recent": 0.18771424239924453, "throughput/batch_count_per_second_update_cum": 0.18757078570932612, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1992294400, "throughput/token_count": 1992294400, "throughput/batch_count": 950, "throughput/flop_count": 0, "throughput/total_time": 9791.05162328505, "throughput/update_time": 5064.716082916246, "throughput/token_count_per_second_total_recent": 201792.02060455224, "throughput/token_count_per_second_total_cum": 203481.14550452694, "throughput/token_count_per_second_update_recent": 393667.5939165046, "throughput/token_count_per_second_update_cum": 393367.44002693315, "throughput/batch_count_per_second_total_recent": 0.09622193365314113, "throughput/batch_count_per_second_total_cum": 0.09702737117029521, "throughput/batch_count_per_second_update_recent": 0.18771533675980787, "throughput/batch_count_per_second_update_cum": 0.187572212232081, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2013265920, "throughput/token_count": 2013265920, "throughput/batch_count": 960, "throughput/flop_count": 0, "throughput/total_time": 9844.444105764036, "throughput/update_time": 5117.988080174255, "throughput/token_count_per_second_total_recent": 223422.13334760707, "throughput/token_count_per_second_total_cum": 204507.83186642398, "throughput/token_count_per_second_update_recent": 393668.6435213183, "throughput/token_count_per_second_update_cum": 393370.57618380646, "throughput/batch_count_per_second_total_recent": 0.1065359751451526, "throughput/batch_count_per_second_total_cum": 0.0975169333774681, "throughput/batch_count_per_second_update_recent": 0.1877158372503845, "throughput/batch_count_per_second_update_cum": 0.18757370766821216, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2034237440, "throughput/token_count": 2034237440, "throughput/batch_count": 970, "throughput/flop_count": 0, "throughput/total_time": 9997.457464576, "throughput/update_time": 5171.263283661159, "throughput/token_count_per_second_total_recent": 201787.347624926, "throughput/token_count_per_second_total_cum": 203475.47836116486, "throughput/token_count_per_second_update_recent": 393669.1698956946, "throughput/token_count_per_second_update_cum": 393373.4038309876, "throughput/batch_count_per_second_total_recent": 0.09621970540281582, "throughput/batch_count_per_second_total_cum": 0.0970246688657593, "throughput/batch_count_per_second_update_recent": 0.18771608824524622, "throughput/batch_count_per_second_update_cum": 0.1875750559954584, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2055208960, "throughput/token_count": 2055208960, "throughput/batch_count": 980, "throughput/flop_count": 0, "throughput/total_time": 10050.842236216005, "throughput/update_time": 5224.530718925176, "throughput/token_count_per_second_total_recent": 223406.85120679133, "throughput/token_count_per_second_total_cum": 204481.26750955312, "throughput/token_count_per_second_update_recent": 393675.06797216856, "throughput/token_count_per_second_update_cum": 393376.7587116055, "throughput/batch_count_per_second_total_recent": 0.10652868805255476, "throughput/batch_count_per_second_total_cum": 0.0975042665050283, "throughput/batch_count_per_second_update_recent": 0.18771890066727093, "throughput/batch_count_per_second_update_cum": 0.18757665572719837, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} diff --git a/metrics/jsonlines/train.jsonl b/metrics/jsonlines/train.jsonl index d709f21715afdaff3708124b71fd540e3cf6ff37..e45db577fe11b808e8b3f98d805235f9fbc81312 100644 --- a/metrics/jsonlines/train.jsonl +++ b/metrics/jsonlines/train.jsonl @@ -1,98 +1,98 @@ -{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 72.8780845789588, "train/update_time": 72.70853926707059, "train/lr": 0.0009000000000000001, "train/loss": 9.761818885803223, "train/global_grad_norm": 1.2346543073654175} -{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 141.98572698398493, "train/update_time": 141.70471469813492, "train/lr": 0.0009997960964140947, "train/loss": 8.126626014709473, "train/global_grad_norm": 0.962840735912323} -{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 301.07908088195836, "train/update_time": 210.67360014707083, "train/lr": 0.0009990914580222257, "train/loss": 7.519838333129883, "train/global_grad_norm": 0.5704449415206909} -{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 370.1664184979745, "train/update_time": 279.6480940769543, "train/lr": 0.0009978842768382998, "train/loss": 7.193228244781494, "train/global_grad_norm": 0.4210163950920105} -{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 529.2283039629692, "train/update_time": 348.5810220290441, "train/lr": 0.0009961757683914405, "train/loss": 6.9471588134765625, "train/global_grad_norm": 0.26851552724838257} -{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 598.2814449759899, "train/update_time": 417.52457091695396, "train/lr": 0.00099396765300483, "train/loss": 6.682523727416992, "train/global_grad_norm": 0.37517017126083374} -{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 757.2756010189769, "train/update_time": 486.45154978276696, "train/lr": 0.0009912621540634887, "train/loss": 6.482426166534424, "train/global_grad_norm": 0.303166002035141} -{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 826.3311064429581, "train/update_time": 555.3910710238852, "train/lr": 0.000988061995775515, "train/loss": 6.281425952911377, "train/global_grad_norm": 0.3328990936279297} -{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 985.3252943049883, "train/update_time": 624.3266040377785, "train/lr": 0.0009843704004290394, "train/loss": 6.091310977935791, "train/global_grad_norm": 0.3429378867149353} -{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1054.376383124967, "train/update_time": 693.2531012066174, "train/lr": 0.0009801910851476522, "train/loss": 5.976389408111572, "train/global_grad_norm": 0.5708628296852112} -{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1213.9692550019827, "train/update_time": 762.1767162576434, "train/lr": 0.0009755282581475768, "train/loss": 5.850888252258301, "train/global_grad_norm": 0.38383719325065613} -{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1283.0319452389958, "train/update_time": 831.1140817146515, "train/lr": 0.0009703866145003512, "train/loss": 5.717982769012451, "train/global_grad_norm": 0.5857133865356445} -{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1442.136592313007, "train/update_time": 900.0497305926983, "train/lr": 0.0009647713314052896, "train/loss": 5.650483131408691, "train/global_grad_norm": 0.3780403137207031} -{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1511.2121625279542, "train/update_time": 969.0007961746887, "train/lr": 0.0009586880629764817, "train/loss": 5.568375587463379, "train/global_grad_norm": 0.4291097819805145} -{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1670.3140526569914, "train/update_time": 1037.9416665838216, "train/lr": 0.0009521429345495787, "train/loss": 5.447340965270996, "train/global_grad_norm": 0.3828825354576111} -{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1739.393865599006, "train/update_time": 1106.8950568859, "train/lr": 0.0009451425365140996, "train/loss": 5.406825065612793, "train/global_grad_norm": 0.570035994052887} -{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1898.5316135869944, "train/update_time": 1175.8391638009343, "train/lr": 0.000937693917677468, "train/loss": 5.300189971923828, "train/global_grad_norm": 0.39234593510627747} -{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1967.6012063919916, "train/update_time": 1244.7901415458764, "train/lr": 0.0009298045781674596, "train/loss": 5.269626617431641, "train/global_grad_norm": 0.5113462209701538} -{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2127.1002123509534, "train/update_time": 1313.7616025957977, "train/lr": 0.0009214824618802108, "train/loss": 5.244931221008301, "train/global_grad_norm": 0.5057875514030457} -{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2196.1893687059637, "train/update_time": 1382.727176492801, "train/lr": 0.000912735948481387, "train/loss": 5.148478984832764, "train/global_grad_norm": 0.4193888008594513} -{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2356.148066883965, "train/update_time": 1451.6794419176877, "train/lr": 0.0009035738449685707, "train/loss": 5.105681896209717, "train/global_grad_norm": 0.4414325952529907} -{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2425.217230288952, "train/update_time": 1520.627368493646, "train/lr": 0.0008940053768033609, "train/loss": 5.069815635681152, "train/global_grad_norm": 0.45171600580215454} -{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2584.4522528109956, "train/update_time": 1589.588658401568, "train/lr": 0.0008840401786221159, "train/loss": 5.012455940246582, "train/global_grad_norm": 0.4408389925956726} -{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2653.526103938988, "train/update_time": 1658.537395758729, "train/lr": 0.0008736882845346905, "train/loss": 4.963330268859863, "train/global_grad_norm": 0.5382868647575378} -{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2812.5629005369847, "train/update_time": 1727.488953433698, "train/lr": 0.0008629601180209381, "train/loss": 4.960586071014404, "train/global_grad_norm": 0.4774056375026703} -{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2881.6501679039793, "train/update_time": 1796.4439407095779, "train/lr": 0.0008518664814351503, "train/loss": 4.907783031463623, "train/global_grad_norm": 0.42411527037620544} -{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3040.8488886230043, "train/update_time": 1865.3935032716836, "train/lr": 0.0008404185451290017, "train/loss": 4.902004718780518, "train/global_grad_norm": 0.6527204513549805} -{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 3109.932256244996, "train/update_time": 1934.345054808713, "train/lr": 0.0008286278362039527, "train/loss": 4.846382141113281, "train/global_grad_norm": 0.5287019610404968} -{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 3268.990683123993, "train/update_time": 2003.3044974627555, "train/lr": 0.0008165062269044352, "train/loss": 4.817776203155518, "train/global_grad_norm": 0.5458475351333618} -{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3338.075166533992, "train/update_time": 2072.2588375147316, "train/lr": 0.0008040659226635089, "train/loss": 4.793833255767822, "train/global_grad_norm": 0.47956281900405884} -{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3497.58566847397, "train/update_time": 2141.20914719766, "train/lr": 0.0007913194498130252, "train/loss": 4.808987140655518, "train/global_grad_norm": 0.4503716826438904} -{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3566.6879564279807, "train/update_time": 2210.160061070579, "train/lr": 0.000778279642970672, "train/loss": 4.740894317626953, "train/global_grad_norm": 0.4691788852214813} -{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3725.7928462160053, "train/update_time": 2279.117056379677, "train/lr": 0.0007649596321166025, "train/loss": 4.758164882659912, "train/global_grad_norm": 0.48899734020233154} -{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3794.8868216549745, "train/update_time": 2348.075175291684, "train/lr": 0.0007513728293726579, "train/loss": 4.721774578094482, "train/global_grad_norm": 0.46350735425949097} -{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3954.2480487469584, "train/update_time": 2417.024196324637, "train/lr": 0.0007375329154974975, "train/loss": 4.702852725982666, "train/global_grad_norm": 0.45745885372161865} -{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 4023.3350287670037, "train/update_time": 2485.9809508775943, "train/lr": 0.0007234538261112341, "train/loss": 4.632237434387207, "train/global_grad_norm": 0.6149536967277527} -{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 4182.707546444959, "train/update_time": 2554.9301924086176, "train/lr": 0.0007091497376634464, "train/loss": 4.653685092926025, "train/global_grad_norm": 0.45167258381843567} -{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 4251.801609379996, "train/update_time": 2623.8855139956577, "train/lr": 0.0006946350531586958, "train/loss": 4.6325860023498535, "train/global_grad_norm": 0.473895788192749} -{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 4410.926594229008, "train/update_time": 2692.833996849775, "train/lr": 0.0006799243876539214, "train/loss": 4.6405487060546875, "train/global_grad_norm": 0.6041337847709656} -{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4480.02484513697, "train/update_time": 2761.798597707704, "train/lr": 0.0006650325535423166, "train/loss": 4.547246932983398, "train/global_grad_norm": 0.5259911417961121} -{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4639.57663258299, "train/update_time": 2830.754078882688, "train/lr": 0.0006499745456385053, "train/loss": 4.569416522979736, "train/global_grad_norm": 0.5999050140380859} -{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4708.671424973989, "train/update_time": 2899.7122086867457, "train/lr": 0.0006347655260800339, "train/loss": 4.56511926651001, "train/global_grad_norm": 0.47612109780311584} -{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4867.7282030819915, "train/update_time": 2968.665082120744, "train/lr": 0.0006194208090603844, "train/loss": 4.56137228012085, "train/global_grad_norm": 0.555321216583252} -{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4936.824467270984, "train/update_time": 3037.629162015859, "train/lr": 0.0006039558454088796, "train/loss": 4.581612586975098, "train/global_grad_norm": 0.4930824935436249} -{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 5096.086593801971, "train/update_time": 3106.58031973982, "train/lr": 0.0005883862070330078, "train/loss": 4.533069610595703, "train/global_grad_norm": 0.6734046339988708} -{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 5165.18791128695, "train/update_time": 3175.5471031158813, "train/lr": 0.0005727275712388317, "train/loss": 4.493007183074951, "train/global_grad_norm": 0.4193324148654938} -{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 5324.26284656598, "train/update_time": 3244.498165418743, "train/lr": 0.0005569957049452703, "train/loss": 4.517164707183838, "train/global_grad_norm": 0.4578356444835663} -{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 5393.351182107988, "train/update_time": 3313.452428144694, "train/lr": 0.0005412064488081482, "train/loss": 4.49570369720459, "train/global_grad_norm": 0.5219614505767822} -{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5552.3678305439535, "train/update_time": 3382.4195148196886, "train/lr": 0.0005253757012699972, "train/loss": 4.4889116287231445, "train/global_grad_norm": 0.3808702528476715} -{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5621.47650762595, "train/update_time": 3451.396044731722, "train/lr": 0.0005095194025516734, "train/loss": 4.463628768920898, "train/global_grad_norm": 0.52295982837677} -{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5780.910811659007, "train/update_time": 3520.3645316287293, "train/lr": 0.0004936535186019053, "train/loss": 4.463111877441406, "train/global_grad_norm": 0.46243464946746826} -{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5850.004313221958, "train/update_time": 3589.3208626466803, "train/lr": 0.00047779402502093696, "train/loss": 4.4545392990112305, "train/global_grad_norm": 0.5457447171211243} -{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 6009.048461712955, "train/update_time": 3658.2837809736375, "train/lr": 0.0004619568909744525, "train/loss": 4.413477420806885, "train/global_grad_norm": 0.4806564152240753} -{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 6078.168775815982, "train/update_time": 3727.2589657856734, "train/lr": 0.00044615806311398067, "train/loss": 4.425002098083496, "train/global_grad_norm": 0.5740962028503418} -{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 6237.341601410008, "train/update_time": 3796.2288036436657, "train/lr": 0.0004304134495199673, "train/loss": 4.370032787322998, "train/global_grad_norm": 0.46423137187957764} -{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 6306.462939933001, "train/update_time": 3865.200992291735, "train/lr": 0.0004147389036836882, "train/loss": 4.4128217697143555, "train/global_grad_norm": 0.5103574991226196} -{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 6465.5906408529845, "train/update_time": 3934.16611681378, "train/lr": 0.0003991502085441259, "train/loss": 4.360372066497803, "train/global_grad_norm": 0.38778555393218994} -{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 6534.709841756965, "train/update_time": 4003.142038117745, "train/lr": 0.0003836630605958888, "train/loss": 4.410104751586914, "train/global_grad_norm": 0.5044620633125305} -{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6694.0332092359895, "train/update_time": 4072.1044038116815, "train/lr": 0.00036829305408417155, "train/loss": 4.391136646270752, "train/global_grad_norm": 0.47766736149787903} -{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6763.143532235001, "train/update_time": 4141.075749416603, "train/lr": 0.000353055665302672, "train/loss": 4.387944221496582, "train/global_grad_norm": 0.4765089154243469} -{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6922.65050104697, "train/update_time": 4210.039972436498, "train/lr": 0.0003379662370102746, "train/loss": 4.3538899421691895, "train/global_grad_norm": 0.41657164692878723} -{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 6991.763516802981, "train/update_time": 4279.012207661464, "train/lr": 0.00032303996298219405, "train/loss": 4.328833103179932, "train/global_grad_norm": 0.44749119877815247} -{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 7150.874404011003, "train/update_time": 4347.981613874435, "train/lr": 0.00030829187271113034, "train/loss": 4.339343070983887, "train/global_grad_norm": 0.4195193946361542} -{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 7219.974368761992, "train/update_time": 4416.951972602401, "train/lr": 0.0002937368162738445, "train/loss": 4.329409599304199, "train/global_grad_norm": 0.43621429800987244} -{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 7379.584015795961, "train/update_time": 4485.922716939414, "train/lr": 0.0002793894493783894, "train/loss": 4.303433418273926, "train/global_grad_norm": 0.4920794367790222} -{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 7448.697525598982, "train/update_time": 4554.898137903423, "train/lr": 0.00026526421860705474, "train/loss": 4.324357986450195, "train/global_grad_norm": 0.39209991693496704} -{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 7607.930423379992, "train/update_time": 4623.874382758397, "train/lr": 0.0002513753468698824, "train/loss": 4.26851749420166, "train/global_grad_norm": 0.4350663721561432} -{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 7677.034741098003, "train/update_time": 4692.8429151014425, "train/lr": 0.00023773681908340283, "train/loss": 4.282830238342285, "train/global_grad_norm": 0.39358457922935486} -{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7836.185687895981, "train/update_time": 4761.811451301328, "train/lr": 0.00022436236808900823, "train/loss": 4.284267902374268, "train/global_grad_norm": 0.4044873118400574} -{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 7905.308589838969, "train/update_time": 4830.788480303425, "train/lr": 0.00021126546082514682, "train/loss": 4.279461860656738, "train/global_grad_norm": 0.41764646768569946} -{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 8064.825394029962, "train/update_time": 4899.746858645405, "train/lr": 0.00019845928476725522, "train/loss": 4.275580406188965, "train/global_grad_norm": 0.34193727374076843} -{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 8133.938365876966, "train/update_time": 4968.716519999434, "train/lr": 0.0001859567346490913, "train/loss": 4.250948429107666, "train/global_grad_norm": 0.37154102325439453} -{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 8293.024653506, "train/update_time": 5037.670779642533, "train/lr": 0.00017377039947882782, "train/loss": 4.268786907196045, "train/global_grad_norm": 0.41168051958084106} -{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 8362.121594669996, "train/update_time": 5106.633266707533, "train/lr": 0.00016191254986299043, "train/loss": 4.253113746643066, "train/global_grad_norm": 0.358982115983963} -{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 8521.302722692955, "train/update_time": 5175.6192292046035, "train/lr": 0.00015039512565099468, "train/loss": 4.235835075378418, "train/global_grad_norm": 0.3571152091026306} -{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 8590.419437660952, "train/update_time": 5244.595247996622, "train/lr": 0.00013922972391273224, "train/loss": 4.197190761566162, "train/global_grad_norm": 0.34816187620162964} -{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 8749.521732740977, "train/update_time": 5313.566678232513, "train/lr": 0.00012842758726130281, "train/loss": 4.2619099617004395, "train/global_grad_norm": 0.31704217195510864} -{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 8818.629101625993, "train/update_time": 5382.53907933255, "train/lr": 0.00011799959253265679, "train/loss": 4.183486461639404, "train/global_grad_norm": 0.3519703447818756} -{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 8977.726054205967, "train/update_time": 5451.502941412618, "train/lr": 0.00010795623983354214, "train/loss": 4.21256685256958, "train/global_grad_norm": 0.3151834309101105} -{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 9046.829455698957, "train/update_time": 5520.4718770905165, "train/lr": 9.830764196878872e-05, "train/loss": 4.190575122833252, "train/global_grad_norm": 0.3258683383464813} -{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 9206.457426826004, "train/update_time": 5589.4371389435255, "train/lr": 8.906351425856951e-05, "train/loss": 4.166137218475342, "train/global_grad_norm": 0.29673388600349426} -{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 9275.580202061974, "train/update_time": 5658.411268384545, "train/lr": 8.02331647558977e-05, "train/loss": 4.178226470947266, "train/global_grad_norm": 0.2848501205444336} -{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 9434.65409148595, "train/update_time": 5727.392325014458, "train/lr": 7.182548487420554e-05, "train/loss": 4.2109150886535645, "train/global_grad_norm": 0.29233965277671814} -{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 9503.765275373997, "train/update_time": 5796.35883763741, "train/lr": 6.384894043444556e-05, "train/loss": 4.159761428833008, "train/global_grad_norm": 0.30078691244125366} -{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 9662.829677159956, "train/update_time": 5865.338317954331, "train/lr": 5.6311563140726166e-05, "train/loss": 4.228906631469727, "train/global_grad_norm": 0.2724829316139221} -{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 9731.945241109002, "train/update_time": 5934.31139906036, "train/lr": 4.922094249306547e-05, "train/loss": 4.2079997062683105, "train/global_grad_norm": 0.25332000851631165} -{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 9891.181177204999, "train/update_time": 6003.2874812923255, "train/lr": 4.2584218145409916e-05, "train/loss": 4.153715133666992, "train/global_grad_norm": 0.2573024034500122} -{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 9960.290040618973, "train/update_time": 6072.254921466229, "train/lr": 3.6408072716606236e-05, "train/loss": 4.171263694763184, "train/global_grad_norm": 0.27709466218948364} -{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 10119.394410843961, "train/update_time": 6141.227180292248, "train/lr": 3.069872506157217e-05, "train/loss": 4.226706504821777, "train/global_grad_norm": 0.25968244671821594} -{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 10188.515749696002, "train/update_time": 6210.206845312321, "train/lr": 2.5461924009435368e-05, "train/loss": 4.141597270965576, "train/global_grad_norm": 0.2525114119052887} -{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 10348.059376804973, "train/update_time": 6279.188010795449, "train/lr": 2.0702942574950812e-05, "train/loss": 4.176564693450928, "train/global_grad_norm": 0.2523636221885681} -{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 10417.185683044954, "train/update_time": 6348.16824121651, "train/lr": 1.642657264902142e-05, "train/loss": 4.20511531829834, "train/global_grad_norm": 0.23348693549633026} -{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 10576.369776681007, "train/update_time": 6417.1375977324205, "train/lr": 1.2637120173670358e-05, "train/loss": 4.189664363861084, "train/global_grad_norm": 0.22364211082458496} -{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 10645.485156638955, "train/update_time": 6486.112916803453, "train/lr": 9.338400806321978e-06, "train/loss": 4.146772384643555, "train/global_grad_norm": 0.22672487795352936} -{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 10804.645974584972, "train/update_time": 6555.09181263647, "train/lr": 6.533736077758867e-06, "train/loss": 4.168900966644287, "train/global_grad_norm": 0.22601623833179474} -{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 10873.77756452799, "train/update_time": 6624.0805201563635, "train/lr": 4.2259500476214406e-06, "train/loss": 4.167483329772949, "train/global_grad_norm": 0.21457543969154358} -{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 11032.958788217977, "train/update_time": 6693.060108309321, "train/lr": 2.417366460819359e-06, "train/loss": 4.191482067108154, "train/global_grad_norm": 0.21356524527072906} -{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 11102.099602014001, "train/update_time": 6762.03791546938, "train/lr": 1.1098064077174619e-06, "train/loss": 4.166873455047607, "train/global_grad_norm": 0.20960550010204315} +{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 57.12486812804127, "train/update_time": 56.932655182085, "train/lr": 0.0009000000000000001, "train/loss": 10.077424049377441, "train/global_grad_norm": 1.0569149255752563} +{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 110.51794190204237, "train/update_time": 110.19485108501976, "train/lr": 0.0009997960964140947, "train/loss": 8.169595718383789, "train/global_grad_norm": 0.6573789119720459} +{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 263.8700318510528, "train/update_time": 163.45965232816525, "train/lr": 0.0009990914580222257, "train/loss": 7.759955406188965, "train/global_grad_norm": 0.28603488206863403} +{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 317.24918574804906, "train/update_time": 216.73109497816768, "train/lr": 0.0009978842768382998, "train/loss": 7.5351409912109375, "train/global_grad_norm": 0.2373751848936081} +{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 470.2395834400086, "train/update_time": 270.0089025082416, "train/lr": 0.0009961757683914405, "train/loss": 7.356375694274902, "train/global_grad_norm": 0.25599583983421326} +{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 523.6280801940011, "train/update_time": 323.28231063415296, "train/lr": 0.00099396765300483, "train/loss": 7.169342041015625, "train/global_grad_norm": 0.21762162446975708} +{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 676.4629730410525, "train/update_time": 376.5575643811608, "train/lr": 0.0009912621540634887, "train/loss": 7.04250955581665, "train/global_grad_norm": 0.17491649091243744} +{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 729.8428020050051, "train/update_time": 429.83724463917315, "train/lr": 0.000988061995775515, "train/loss": 6.879690647125244, "train/global_grad_norm": 0.17261892557144165} +{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 884.0807184120058, "train/update_time": 483.11551754607353, "train/lr": 0.0009843704004290394, "train/loss": 6.732751369476318, "train/global_grad_norm": 0.32638832926750183} +{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 937.4650811910396, "train/update_time": 536.3882422860479, "train/lr": 0.0009801910851476522, "train/loss": 6.633055210113525, "train/global_grad_norm": 0.18298597633838654} +{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1091.3179251340334, "train/update_time": 589.6708546730806, "train/lr": 0.0009755282581475768, "train/loss": 6.5601725578308105, "train/global_grad_norm": 0.7863500714302063} +{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1144.6911778100184, "train/update_time": 642.9422557381331, "train/lr": 0.0009703866145003512, "train/loss": 6.427718162536621, "train/global_grad_norm": 0.2532098889350891} +{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1298.4373718530405, "train/update_time": 696.2163101581973, "train/lr": 0.0009647713314052896, "train/loss": 6.381680488586426, "train/global_grad_norm": 0.1815725564956665} +{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1351.811079526029, "train/update_time": 749.4968144011218, "train/lr": 0.0009586880629764817, "train/loss": 6.308362007141113, "train/global_grad_norm": 0.25452741980552673} +{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1504.7933711430524, "train/update_time": 802.7697926640394, "train/lr": 0.0009521429345495787, "train/loss": 6.192831039428711, "train/global_grad_norm": 0.2731724679470062} +{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1558.167400816048, "train/update_time": 856.0381595880608, "train/lr": 0.0009451425365140996, "train/loss": 6.160712242126465, "train/global_grad_norm": 0.25031647086143494} +{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1711.111571622023, "train/update_time": 909.3183224739623, "train/lr": 0.000937693917677468, "train/loss": 6.076303005218506, "train/global_grad_norm": 0.22126874327659607} +{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1764.4956908360473, "train/update_time": 962.5948072728934, "train/lr": 0.0009298045781674596, "train/loss": 6.05035400390625, "train/global_grad_norm": 0.20042574405670166} +{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1917.3773277360015, "train/update_time": 1015.8676893726224, "train/lr": 0.0009214824618802108, "train/loss": 6.025995254516602, "train/global_grad_norm": 0.4825673997402191} +{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1970.7506705410196, "train/update_time": 1069.132912081608, "train/lr": 0.000912735948481387, "train/loss": 5.939733505249023, "train/global_grad_norm": 0.201382115483284} +{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2124.242168805038, "train/update_time": 1122.4119238386047, "train/lr": 0.0009035738449685707, "train/loss": 5.90223503112793, "train/global_grad_norm": 0.44680795073509216} +{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2177.6055200890405, "train/update_time": 1175.6836349036312, "train/lr": 0.0008940053768033609, "train/loss": 5.879762172698975, "train/global_grad_norm": 0.24954979121685028} +{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2330.6941529700416, "train/update_time": 1228.951100654609, "train/lr": 0.0008840401786221159, "train/loss": 5.81107234954834, "train/global_grad_norm": 0.2971765398979187} +{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2384.0519924180117, "train/update_time": 1282.2238651026273, "train/lr": 0.0008736882845346905, "train/loss": 5.7556939125061035, "train/global_grad_norm": 0.2694259583950043} +{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2537.098761650035, "train/update_time": 1335.4923671315191, "train/lr": 0.0008629601180209381, "train/loss": 5.76292610168457, "train/global_grad_norm": 0.3520471751689911} +{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2590.4739345350536, "train/update_time": 1388.759745098534, "train/lr": 0.0008518664814351503, "train/loss": 5.717782974243164, "train/global_grad_norm": 0.4163813591003418} +{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 2743.4162192750373, "train/update_time": 1442.0279306704178, "train/lr": 0.0008404185451290017, "train/loss": 5.692546844482422, "train/global_grad_norm": 0.21434997022151947} +{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 2796.791148387012, "train/update_time": 1495.3029013883206, "train/lr": 0.0008286278362039527, "train/loss": 5.643004894256592, "train/global_grad_norm": 0.2754496932029724} +{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2949.6854955510353, "train/update_time": 1548.5792963503627, "train/lr": 0.0008165062269044352, "train/loss": 5.610556125640869, "train/global_grad_norm": 0.2890426218509674} +{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3003.058338998002, "train/update_time": 1601.8528411513544, "train/lr": 0.0008040659226635089, "train/loss": 5.58476448059082, "train/global_grad_norm": 0.380667507648468} +{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3156.3314061540295, "train/update_time": 1655.1285377033055, "train/lr": 0.0007913194498130252, "train/loss": 5.5936055183410645, "train/global_grad_norm": 0.2591659426689148} +{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3209.7035325120087, "train/update_time": 1708.4027871834696, "train/lr": 0.000778279642970672, "train/loss": 5.527544021606445, "train/global_grad_norm": 0.22509922087192535} +{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3362.512193232018, "train/update_time": 1761.6806279715383, "train/lr": 0.0007649596321166025, "train/loss": 5.541203022003174, "train/global_grad_norm": 0.4492305815219879} +{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3415.8845459170407, "train/update_time": 1814.9587236176012, "train/lr": 0.0007513728293726579, "train/loss": 5.501528263092041, "train/global_grad_norm": 0.3490087687969208} +{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3570.6092982320115, "train/update_time": 1868.2295263125561, "train/lr": 0.0007375329154974975, "train/loss": 5.4834885597229, "train/global_grad_norm": 0.3601242005825043} +{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 3623.980893074011, "train/update_time": 1921.4990626386134, "train/lr": 0.0007234538261112341, "train/loss": 5.410107612609863, "train/global_grad_norm": 0.4656950533390045} +{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 3779.0529326410033, "train/update_time": 1974.7656041345908, "train/lr": 0.0007091497376634464, "train/loss": 5.43698263168335, "train/global_grad_norm": 0.4004105031490326} +{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 3832.4348147350247, "train/update_time": 2028.0436954226461, "train/lr": 0.0006946350531586958, "train/loss": 5.412634372711182, "train/global_grad_norm": 0.29114505648612976} +{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 3986.6730023160344, "train/update_time": 2081.339985151775, "train/lr": 0.0006799243876539214, "train/loss": 5.414259910583496, "train/global_grad_norm": 0.3105607330799103} +{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4040.0474286440294, "train/update_time": 2134.611642122676, "train/lr": 0.0006650325535423166, "train/loss": 5.334737300872803, "train/global_grad_norm": 0.30427536368370056} +{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4193.527929550037, "train/update_time": 2187.884143058851, "train/lr": 0.0006499745456385053, "train/loss": 5.344532489776611, "train/global_grad_norm": 0.33702728152275085} +{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4246.9178847110015, "train/update_time": 2241.15703301772, "train/lr": 0.0006347655260800339, "train/loss": 5.3456807136535645, "train/global_grad_norm": 0.31774818897247314} +{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4400.437362540048, "train/update_time": 2294.435530113755, "train/lr": 0.0006194208090603844, "train/loss": 5.333301544189453, "train/global_grad_norm": 0.3481753468513489} +{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4453.811728182016, "train/update_time": 2347.708159423666, "train/lr": 0.0006039558454088796, "train/loss": 5.352600574493408, "train/global_grad_norm": 0.25959765911102295} +{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 4608.291631601052, "train/update_time": 2400.977511668694, "train/lr": 0.0005883862070330078, "train/loss": 5.296506881713867, "train/global_grad_norm": 0.3832894265651703} +{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 4661.668510478048, "train/update_time": 2454.2510100168292, "train/lr": 0.0005727275712388317, "train/loss": 5.275446891784668, "train/global_grad_norm": 0.42716965079307556} +{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 4816.396098136029, "train/update_time": 2507.5231073708273, "train/lr": 0.0005569957049452703, "train/loss": 5.291362285614014, "train/global_grad_norm": 0.382432758808136} +{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 4869.772359199007, "train/update_time": 2560.802637038869, "train/lr": 0.0005412064488081482, "train/loss": 5.263927936553955, "train/global_grad_norm": 0.2995292842388153} +{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5024.381550224032, "train/update_time": 2614.0767399497563, "train/lr": 0.0005253757012699972, "train/loss": 5.269484043121338, "train/global_grad_norm": 0.32839062809944153} +{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5077.749329403043, "train/update_time": 2667.341171991662, "train/lr": 0.0005095194025516734, "train/loss": 5.244964122772217, "train/global_grad_norm": 0.28380733728408813} +{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5232.712933773, "train/update_time": 2720.6185927585466, "train/lr": 0.0004936535186019053, "train/loss": 5.242177963256836, "train/global_grad_norm": 0.3184642493724823} +{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5286.093386778026, "train/update_time": 2773.886181908485, "train/lr": 0.00047779402502093696, "train/loss": 5.2353105545043945, "train/global_grad_norm": 0.39006081223487854} +{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 5441.220508124039, "train/update_time": 2827.1616227625636, "train/lr": 0.0004619568909744525, "train/loss": 5.193413734436035, "train/global_grad_norm": 0.33428674936294556} +{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 5494.618768353015, "train/update_time": 2880.4332658784697, "train/lr": 0.00044615806311398067, "train/loss": 5.208868503570557, "train/global_grad_norm": 0.331386536359787} +{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 5649.622147237009, "train/update_time": 2933.705731303431, "train/lr": 0.0004304134495199673, "train/loss": 5.155785083770752, "train/global_grad_norm": 0.31903383135795593} +{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 5703.016606429999, "train/update_time": 2986.9915024373913, "train/lr": 0.0004147389036836882, "train/loss": 5.194952011108398, "train/global_grad_norm": 0.2875197231769562} +{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 5856.936125319044, "train/update_time": 3040.2605143213877, "train/lr": 0.0003991502085441259, "train/loss": 5.166281223297119, "train/global_grad_norm": 0.2693222165107727} +{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 5910.318671693036, "train/update_time": 3093.5336661074543, "train/lr": 0.0003836630605958888, "train/loss": 5.196907043457031, "train/global_grad_norm": 0.28884732723236084} +{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6063.751631618012, "train/update_time": 3146.8101018704474, "train/lr": 0.00036829305408417155, "train/loss": 5.1710686683654785, "train/global_grad_norm": 0.2542065382003784} +{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6117.130997166038, "train/update_time": 3200.0810677845147, "train/lr": 0.000353055665302672, "train/loss": 5.1745829582214355, "train/global_grad_norm": 0.2485460489988327} +{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6270.89233304502, "train/update_time": 3253.376109398436, "train/lr": 0.0003379662370102746, "train/loss": 5.1606316566467285, "train/global_grad_norm": 0.24228136241436005} +{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 6324.271360302053, "train/update_time": 3306.647373120475, "train/lr": 0.00032303996298219405, "train/loss": 5.135626316070557, "train/global_grad_norm": 0.297547847032547} +{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 6477.804279661039, "train/update_time": 3359.92066662648, "train/lr": 0.00030829187271113034, "train/loss": 5.145682334899902, "train/global_grad_norm": 0.2230217158794403} +{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 6531.185974933032, "train/update_time": 3413.19108713849, "train/lr": 0.0002937368162738445, "train/loss": 5.152654647827148, "train/global_grad_norm": 0.2998616695404053} +{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 6684.524030425004, "train/update_time": 3466.4617998044705, "train/lr": 0.0002793894493783894, "train/loss": 5.121702194213867, "train/global_grad_norm": 0.24779871106147766} +{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 6737.915407613036, "train/update_time": 3519.7321525084553, "train/lr": 0.00026526421860705474, "train/loss": 5.143693447113037, "train/global_grad_norm": 0.25348708033561707} +{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 6891.055202779011, "train/update_time": 3573.0133047814597, "train/lr": 0.0002513753468698824, "train/loss": 5.096343040466309, "train/global_grad_norm": 0.23598778247833252} +{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 6944.4543808570015, "train/update_time": 3626.2884129853337, "train/lr": 0.00023773681908340283, "train/loss": 5.104409217834473, "train/global_grad_norm": 0.25248244404792786} +{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7097.769050646049, "train/update_time": 3679.5599112784257, "train/lr": 0.00022436236808900823, "train/loss": 5.114173889160156, "train/global_grad_norm": 0.1796308010816574} +{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 7151.144483595039, "train/update_time": 3732.833151328552, "train/lr": 0.00021126546082514682, "train/loss": 5.100642681121826, "train/global_grad_norm": 0.21071678400039673} +{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 7304.954160230001, "train/update_time": 3786.1120017025387, "train/lr": 0.00019845928476725522, "train/loss": 5.092477321624756, "train/global_grad_norm": 0.2642405927181244} +{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 7358.337214609026, "train/update_time": 3839.3859579174896, "train/lr": 0.0001859567346490913, "train/loss": 5.1006293296813965, "train/global_grad_norm": 0.2527276873588562} +{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 7511.621099121054, "train/update_time": 3892.6592194504337, "train/lr": 0.00017377039947882782, "train/loss": 5.108747959136963, "train/global_grad_norm": 0.18992801010608673} +{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 7565.03086849401, "train/update_time": 3945.9412919793394, "train/lr": 0.00016191254986299043, "train/loss": 5.105123043060303, "train/global_grad_norm": 0.19203054904937744} +{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 7718.234012908011, "train/update_time": 3999.225145033328, "train/lr": 0.00015039512565099468, "train/loss": 5.096941947937012, "train/global_grad_norm": 0.18585249781608582} +{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 7771.636799781001, "train/update_time": 4052.503600837372, "train/lr": 0.00013922972391273224, "train/loss": 5.052731990814209, "train/global_grad_norm": 0.20207689702510834} +{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 7924.894128009037, "train/update_time": 4105.784444000397, "train/lr": 0.00012842758726130281, "train/loss": 5.112724304199219, "train/global_grad_norm": 0.20703168213367462} +{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 7978.290488699044, "train/update_time": 4159.058140857378, "train/lr": 0.00011799959253265679, "train/loss": 5.0365166664123535, "train/global_grad_norm": 0.19214113056659698} +{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 8133.211715042999, "train/update_time": 4212.332782215439, "train/lr": 0.00010795623983354214, "train/loss": 5.071290016174316, "train/global_grad_norm": 0.18038234114646912} +{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 8186.5969784220215, "train/update_time": 4265.60545003548, "train/lr": 9.830764196878872e-05, "train/loss": 5.038881301879883, "train/global_grad_norm": 0.21081118285655975} +{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 8342.015340365004, "train/update_time": 4318.878572056361, "train/lr": 8.906351425856951e-05, "train/loss": 5.020392894744873, "train/global_grad_norm": 0.14397121965885162} +{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 8395.40680388402, "train/update_time": 4372.159924876352, "train/lr": 8.02331647558977e-05, "train/loss": 5.033623695373535, "train/global_grad_norm": 0.20169667899608612} +{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 8550.23874416505, "train/update_time": 4425.43352887634, "train/lr": 7.182548487420554e-05, "train/loss": 5.0619354248046875, "train/global_grad_norm": 0.1709950715303421} +{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 8603.637448858004, "train/update_time": 4478.715910169412, "train/lr": 6.384894043444556e-05, "train/loss": 5.039224147796631, "train/global_grad_norm": 0.14815856516361237} +{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 8756.833522661007, "train/update_time": 4531.9930339534185, "train/lr": 5.6311563140726166e-05, "train/loss": 5.098855495452881, "train/global_grad_norm": 0.14111174643039703} +{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 8810.230298971, "train/update_time": 4585.266904477379, "train/lr": 4.922094249306547e-05, "train/loss": 5.075422286987305, "train/global_grad_norm": 0.1590386927127838} +{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 8963.212115761009, "train/update_time": 4638.541050948435, "train/lr": 4.2584218145409916e-05, "train/loss": 5.018501281738281, "train/global_grad_norm": 0.14633402228355408} +{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 9016.607044072007, "train/update_time": 4691.8174923404, "train/lr": 3.6408072716606236e-05, "train/loss": 5.05587911605835, "train/global_grad_norm": 0.1401192992925644} +{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 9169.565102360037, "train/update_time": 4745.092234898475, "train/lr": 3.069872506157217e-05, "train/loss": 5.094274044036865, "train/global_grad_norm": 0.14165186882019043} +{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 9222.955616571999, "train/update_time": 4798.3678903255495, "train/lr": 2.5461924009435368e-05, "train/loss": 5.02672815322876, "train/global_grad_norm": 0.1492566168308258} +{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 9376.841395194002, "train/update_time": 4851.6424762834795, "train/lr": 2.0702942574950812e-05, "train/loss": 5.048681735992432, "train/global_grad_norm": 0.1326991319656372} +{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 9430.23481100105, "train/update_time": 4904.91280556639, "train/lr": 1.642657264902142e-05, "train/loss": 5.089313983917236, "train/global_grad_norm": 0.1482771933078766} +{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 9583.694742296997, "train/update_time": 4958.171936342376, "train/lr": 1.2637120173670358e-05, "train/loss": 5.060367584228516, "train/global_grad_norm": 0.13109560310840607} +{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 9637.072636537021, "train/update_time": 5011.441395019239, "train/lr": 9.338400806321978e-06, "train/loss": 5.0208845138549805, "train/global_grad_norm": 0.12541547417640686} +{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 9791.05162328505, "train/update_time": 5064.716082916246, "train/lr": 6.533736077758867e-06, "train/loss": 5.054846286773682, "train/global_grad_norm": 0.12279438227415085} +{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 9844.444105764036, "train/update_time": 5117.988080174255, "train/lr": 4.2259500476214406e-06, "train/loss": 5.061497211456299, "train/global_grad_norm": 0.11868078261613846} +{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 9997.457464576, "train/update_time": 5171.263283661159, "train/lr": 2.417366460819359e-06, "train/loss": 5.073247909545898, "train/global_grad_norm": 0.12584054470062256} +{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 10050.842236216005, "train/update_time": 5224.530718925176, "train/lr": 1.1098064077174619e-06, "train/loss": 5.0577826499938965, "train/global_grad_norm": 0.11671043187379837} diff --git a/metrics/jsonlines/train_eval.jsonl b/metrics/jsonlines/train_eval.jsonl index bdae93c26c5727316520fce1faab7e24a4612d77..2b165ae1d3f68381a10401d1186043dc07d869da 100644 --- a/metrics/jsonlines/train_eval.jsonl +++ b/metrics/jsonlines/train_eval.jsonl @@ -1,19 +1,19 @@ -{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 529.2283039629692, "train_eval/train_update_time": 348.5810220290441, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.262740562529434, "train_eval/perplexity_len_2048": 3876.703904883719, "train_eval/loss_avg_len_1024": 8.26358847254669, "train_eval/perplexity_len_1024": 3879.992394933413, "train_eval/loss_avg_len_512": 8.264395577695833, "train_eval/perplexity_len_512": 3883.125220863903} -{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1054.376383124967, "train_eval/train_update_time": 693.2531012066174, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.399058190044452, "train_eval/perplexity_len_2048": 601.2784810600184, "train_eval/loss_avg_len_1024": 6.403318745188226, "train_eval/perplexity_len_1024": 603.8457262467264, "train_eval/loss_avg_len_512": 6.409655180920672, "train_eval/perplexity_len_512": 607.6841038572269} -{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1670.3140526569914, "train_eval/train_update_time": 1037.9416665838216, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.696208518195235, "train_eval/perplexity_len_2048": 297.73639610132165, "train_eval/loss_avg_len_1024": 5.702004268196542, "train_eval/perplexity_len_1024": 299.46701208382314, "train_eval/loss_avg_len_512": 5.713629163519217, "train_eval/perplexity_len_512": 302.96859810428947} -{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2196.1893687059637, "train_eval/train_update_time": 1382.727176492801, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.298926785588973, "train_eval/perplexity_len_2048": 200.12192095488965, "train_eval/loss_avg_len_1024": 5.3072640442429835, "train_eval/perplexity_len_1024": 201.79736376729673, "train_eval/loss_avg_len_512": 5.32238381281153, "train_eval/perplexity_len_512": 204.8716760832407} -{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2812.5629005369847, "train_eval/train_update_time": 1727.488953433698, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.045949606849063, "train_eval/perplexity_len_2048": 155.3917902663376, "train_eval/loss_avg_len_1024": 5.054079064009093, "train_eval/perplexity_len_1024": 156.66018988300135, "train_eval/loss_avg_len_512": 5.071243590088998, "train_eval/perplexity_len_512": 159.3723980930882} -{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3338.075166533992, "train_eval/train_update_time": 2072.2588375147316, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.879746701723743, "train_eval/perplexity_len_2048": 131.59732628792167, "train_eval/loss_avg_len_1024": 4.887840953422929, "train_eval/perplexity_len_1024": 132.66683074977718, "train_eval/loss_avg_len_512": 4.906784731852021, "train_eval/perplexity_len_512": 135.2039976855768} -{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3954.2480487469584, "train_eval/train_update_time": 2417.024196324637, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.75083749865229, "train_eval/perplexity_len_2048": 115.68112675666295, "train_eval/loss_avg_len_1024": 4.762147275177813, "train_eval/perplexity_len_1024": 116.99688086604426, "train_eval/loss_avg_len_512": 4.784011943052464, "train_eval/perplexity_len_512": 119.58314973081396} -{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4480.02484513697, "train_eval/train_update_time": 2761.798597707704, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.651610771730466, "train_eval/perplexity_len_2048": 104.75358386553785, "train_eval/loss_avg_len_1024": 4.662627696003729, "train_eval/perplexity_len_1024": 105.91402668452575, "train_eval/loss_avg_len_512": 4.68646468473402, "train_eval/perplexity_len_512": 108.46902894640158} -{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5096.086593801971, "train_eval/train_update_time": 3106.58031973982, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.568712306133184, "train_eval/perplexity_len_2048": 96.4198705230403, "train_eval/loss_avg_len_1024": 4.583387268669976, "train_eval/perplexity_len_1024": 97.84526171163627, "train_eval/loss_avg_len_512": 4.611602480377551, "train_eval/perplexity_len_512": 100.64530260218329} -{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5621.47650762595, "train_eval/train_update_time": 3451.396044731722, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.499271789527883, "train_eval/perplexity_len_2048": 89.95160374466079, "train_eval/loss_avg_len_1024": 4.5137719763946365, "train_eval/perplexity_len_1024": 91.26542108671948, "train_eval/loss_avg_len_512": 4.544290530496583, "train_eval/perplexity_len_512": 94.0936469499529} -{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6237.341601410008, "train_eval/train_update_time": 3796.2288036436657, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.433857150532786, "train_eval/perplexity_len_2048": 84.25577817268656, "train_eval/loss_avg_len_1024": 4.448194169824673, "train_eval/perplexity_len_1024": 85.4724558145983, "train_eval/loss_avg_len_512": 4.479190395291953, "train_eval/perplexity_len_512": 88.16326637857776} -{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6763.143532235001, "train_eval/train_update_time": 4141.075749416603, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.377449188591417, "train_eval/perplexity_len_2048": 79.63464115655327, "train_eval/loss_avg_len_1024": 4.392922887768108, "train_eval/perplexity_len_1024": 80.87646667714118, "train_eval/loss_avg_len_512": 4.428195524361727, "train_eval/perplexity_len_512": 83.7801012769142} -{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7379.584015795961, "train_eval/train_update_time": 4485.922716939414, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.331950723144054, "train_eval/perplexity_len_2048": 76.09257743684577, "train_eval/loss_avg_len_1024": 4.353910537918127, "train_eval/perplexity_len_1024": 77.7820385775537, "train_eval/loss_avg_len_512": 4.393551132318316, "train_eval/perplexity_len_512": 80.92729284052949} -{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7905.308589838969, "train_eval/train_update_time": 4830.788480303425, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.285456626634223, "train_eval/perplexity_len_2048": 72.63570654655291, "train_eval/loss_avg_len_1024": 4.308664352084252, "train_eval/perplexity_len_1024": 74.3411290261473, "train_eval/loss_avg_len_512": 4.353479764840158, "train_eval/perplexity_len_512": 77.74853938517812} -{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8521.302722692955, "train_eval/train_update_time": 5175.6192292046035, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.248288494520384, "train_eval/perplexity_len_2048": 69.9855291689276, "train_eval/loss_avg_len_1024": 4.272928099082565, "train_eval/perplexity_len_1024": 71.73136495915584, "train_eval/loss_avg_len_512": 4.320970644394619, "train_eval/perplexity_len_512": 75.26164514357997} -{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9046.829455698957, "train_eval/train_update_time": 5520.4718770905165, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.221004894153129, "train_eval/perplexity_len_2048": 68.10188510203145, "train_eval/loss_avg_len_1024": 4.248565303894974, "train_eval/perplexity_len_1024": 70.00490450100025, "train_eval/loss_avg_len_512": 4.299022679186128, "train_eval/perplexity_len_512": 73.6278005431706} -{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9662.829677159956, "train_eval/train_update_time": 5865.338317954331, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.193981613898331, "train_eval/perplexity_len_2048": 66.28619225565306, "train_eval/loss_avg_len_1024": 4.21717632021635, "train_eval/perplexity_len_1024": 67.84165048130349, "train_eval/loss_avg_len_512": 4.269808194869183, "train_eval/perplexity_len_512": 71.50791871807111} -{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10188.515749696002, "train_eval/train_update_time": 6210.206845312321, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.1873091592156015, "train_eval/perplexity_len_2048": 65.84537295065353, "train_eval/loss_avg_len_1024": 4.213890584899655, "train_eval/perplexity_len_1024": 67.61910658476074, "train_eval/loss_avg_len_512": 4.267942122658933, "train_eval/perplexity_len_512": 71.37460420407933} -{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10804.645974584972, "train_eval/train_update_time": 6555.09181263647, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.174864357211755, "train_eval/perplexity_len_2048": 65.03101807412418, "train_eval/loss_avg_len_1024": 4.204799810601998, "train_eval/perplexity_len_1024": 67.00718219606644, "train_eval/loss_avg_len_512": 4.25834744757838, "train_eval/perplexity_len_512": 70.69306287994196} +{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 470.2395834400086, "train_eval/train_update_time": 270.0089025082416, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.520986258850897, "train_eval/perplexity_len_2048": 5019.001351094721, "train_eval/loss_avg_len_1024": 8.522497836221008, "train_eval/perplexity_len_1024": 5026.593696720888, "train_eval/loss_avg_len_512": 8.524103445280343, "train_eval/perplexity_len_512": 5034.6709237971745} +{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 937.4650811910396, "train_eval/train_update_time": 536.3882422860479, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.966422270688636, "train_eval/perplexity_len_2048": 1060.4220525980056, "train_eval/loss_avg_len_1024": 6.9722344631998565, "train_eval/perplexity_len_1024": 1066.6033758299184, "train_eval/loss_avg_len_512": 6.982165958659024, "train_eval/perplexity_len_512": 1077.249118985535} +{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1504.7933711430524, "train_eval/train_update_time": 802.7697926640394, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.408694662143243, "train_eval/perplexity_len_2048": 607.100692098556, "train_eval/loss_avg_len_1024": 6.417305716021801, "train_eval/perplexity_len_1024": 612.3510419477873, "train_eval/loss_avg_len_512": 6.434781106839073, "train_eval/perplexity_len_512": 623.1461654998286} +{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1970.7506705410196, "train_eval/train_update_time": 1069.132912081608, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.078357431389886, "train_eval/perplexity_len_2048": 436.31193351807616, "train_eval/loss_avg_len_1024": 6.088771626625675, "train_eval/perplexity_len_1024": 440.87951373197876, "train_eval/loss_avg_len_512": 6.107667324735084, "train_eval/perplexity_len_512": 449.28944546444586} +{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2537.098761650035, "train_eval/train_update_time": 1335.4923671315191, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.844426090481538, "train_eval/perplexity_len_2048": 345.3043114722714, "train_eval/loss_avg_len_1024": 5.8543462534093855, "train_eval/perplexity_len_1024": 348.746833459867, "train_eval/loss_avg_len_512": 5.874322384678671, "train_eval/perplexity_len_512": 355.78349462406976} +{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3003.058338998002, "train_eval/train_update_time": 1601.8528411513544, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.678305028288451, "train_eval/perplexity_len_2048": 292.4533094997349, "train_eval/loss_avg_len_1024": 5.688276190591132, "train_eval/perplexity_len_1024": 295.38399580743766, "train_eval/loss_avg_len_512": 5.709491532435932, "train_eval/perplexity_len_512": 301.71761565218657} +{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3570.6092982320115, "train_eval/train_update_time": 1868.2295263125561, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.536149771192977, "train_eval/perplexity_len_2048": 253.69931617483144, "train_eval/loss_avg_len_1024": 5.548100212041172, "train_eval/perplexity_len_1024": 256.74932301083805, "train_eval/loss_avg_len_512": 5.5710370129648075, "train_eval/perplexity_len_512": 262.7063879626648} +{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4040.0474286440294, "train_eval/train_update_time": 2134.611642122676, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.429012276458179, "train_eval/perplexity_len_2048": 227.92400829756792, "train_eval/loss_avg_len_1024": 5.44054473506887, "train_eval/perplexity_len_1024": 230.56774759580898, "train_eval/loss_avg_len_512": 5.463790042017644, "train_eval/perplexity_len_512": 235.9901441359267} +{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4608.291631601052, "train_eval/train_update_time": 2400.977511668694, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.341378729817461, "train_eval/perplexity_len_2048": 208.8003912516811, "train_eval/loss_avg_len_1024": 5.3547878493463585, "train_eval/perplexity_len_1024": 211.61907646489718, "train_eval/loss_avg_len_512": 5.3801865651574925, "train_eval/perplexity_len_512": 217.0627679970756} +{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5077.749329403043, "train_eval/train_update_time": 2667.341171991662, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.274126975246473, "train_eval/perplexity_len_2048": 195.21997022888468, "train_eval/loss_avg_len_1024": 5.286933790515396, "train_eval/perplexity_len_1024": 197.73619434175876, "train_eval/loss_avg_len_512": 5.313358115418087, "train_eval/perplexity_len_512": 203.0308860337738} +{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5649.622147237009, "train_eval/train_update_time": 2933.705731303431, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.216920477526092, "train_eval/perplexity_len_2048": 184.36555110302936, "train_eval/loss_avg_len_1024": 5.227359136996128, "train_eval/perplexity_len_1024": 186.30016010416225, "train_eval/loss_avg_len_512": 5.251418407165692, "train_eval/perplexity_len_512": 190.83676080814456} +{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6117.130997166038, "train_eval/train_update_time": 3200.0810677845147, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.171960423406071, "train_eval/perplexity_len_2048": 176.26004331317833, "train_eval/loss_avg_len_1024": 5.182277223829406, "train_eval/perplexity_len_1024": 178.0878955837198, "train_eval/loss_avg_len_512": 5.207336967919982, "train_eval/perplexity_len_512": 182.60712149488162} +{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6684.524030425004, "train_eval/train_update_time": 3466.4617998044705, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.14503268874938, "train_eval/perplexity_len_2048": 171.57709321999303, "train_eval/loss_avg_len_1024": 5.159728246359883, "train_eval/perplexity_len_1024": 174.11713221076468, "train_eval/loss_avg_len_512": 5.184838373988023, "train_eval/perplexity_len_512": 178.54459000759974} +{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7151.144483595039, "train_eval/train_update_time": 3732.833151328552, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.112421451262962, "train_eval/perplexity_len_2048": 166.07200365399407, "train_eval/loss_avg_len_1024": 5.126486331900814, "train_eval/perplexity_len_1024": 168.4242900992313, "train_eval/loss_avg_len_512": 5.1545387507501434, "train_eval/perplexity_len_512": 173.21589262959722} +{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7718.234012908011, "train_eval/train_update_time": 3999.225145033328, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.092018521135159, "train_eval/perplexity_len_2048": 162.71798046774887, "train_eval/loss_avg_len_1024": 5.106942968778494, "train_eval/perplexity_len_1024": 165.16466876932896, "train_eval/loss_avg_len_512": 5.13422550390591, "train_eval/perplexity_len_512": 169.73281155532834} +{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8186.5969784220215, "train_eval/train_update_time": 4265.60545003548, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.076504996041095, "train_eval/perplexity_len_2048": 160.21313075547536, "train_eval/loss_avg_len_1024": 5.0923651870700635, "train_eval/perplexity_len_1024": 162.77439902720477, "train_eval/loss_avg_len_512": 5.119650275185922, "train_eval/perplexity_len_512": 167.27685852206267} +{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8756.833522661007, "train_eval/train_update_time": 4531.9930339534185, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.060602921242007, "train_eval/perplexity_len_2048": 157.68555964213598, "train_eval/loss_avg_len_1024": 5.0714183489211795, "train_eval/perplexity_len_1024": 159.40025226107093, "train_eval/loss_avg_len_512": 5.098776889980873, "train_eval/perplexity_len_512": 163.82141309995419} +{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9222.955616571999, "train_eval/train_update_time": 4798.3678903255495, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.060850694276305, "train_eval/perplexity_len_2048": 157.7246347123879, "train_eval/loss_avg_len_1024": 5.075187498527157, "train_eval/perplexity_len_1024": 160.00218934191147, "train_eval/loss_avg_len_512": 5.102414626430763, "train_eval/perplexity_len_512": 164.4184374759526} +{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9791.05162328505, "train_eval/train_update_time": 5064.716082916246, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.051834876534377, "train_eval/perplexity_len_2048": 156.3090092520558, "train_eval/loss_avg_len_1024": 5.0681735769458465, "train_eval/perplexity_len_1024": 158.88387300919587, "train_eval/loss_avg_len_512": 5.094969020260178, "train_eval/perplexity_len_512": 163.19878869005936} diff --git a/metrics/jsonlines/val.jsonl b/metrics/jsonlines/val.jsonl index a4c34259d33fb594e68bbcb0d187d11475ed4b50..6a16faf20c48ba4382e7823e4bd72e95d8b0859d 100644 --- a/metrics/jsonlines/val.jsonl +++ b/metrics/jsonlines/val.jsonl @@ -1,49 +1,49 @@ -{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 141.98572698398493, "val/train_update_time": 141.70471469813492, "val/loss": 8.017322055562005, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.0004004840157, "val/val_tokens_per_second": 455109.0859565075, "val/loss_avg_len_2048": 8.017322055562005, "val/perplexity_len_2048": 3033.044124021049, "val/loss_avg_len_1024": 8.01611577590569, "val/perplexity_len_1024": 3029.387630417841, "val/loss_avg_len_512": 8.016580909416453, "val/perplexity_len_512": 3030.7970278754246} -{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 370.1664184979745, "val/train_update_time": 279.6480940769543, "val/loss": 7.168812248389213, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.01553106895881, "val/val_tokens_per_second": 455032.587305645, "val/loss_avg_len_2048": 7.168812248389213, "val/perplexity_len_2048": 1298.301626800787, "val/loss_avg_len_1024": 7.169238402332319, "val/perplexity_len_1024": 1298.8550210655874, "val/loss_avg_len_512": 7.172547295653354, "val/perplexity_len_512": 1303.1599120545325} -{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 598.2814449759899, "val/train_update_time": 417.52457091695396, "val/loss": 6.6822778238516545, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9535074569867, "val/val_tokens_per_second": 455346.3356566274, "val/loss_avg_len_2048": 6.6822778238516545, "val/perplexity_len_2048": 798.13505409571, "val/loss_avg_len_1024": 6.683723994776049, "val/perplexity_len_1024": 799.2901288211134, "val/loss_avg_len_512": 6.689726676506551, "val/perplexity_len_512": 804.102442017171} -{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 826.3311064429581, "val/train_update_time": 555.3910710238852, "val/loss": 6.252672681268258, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.92056709702592, "val/val_tokens_per_second": 455513.1414574312, "val/loss_avg_len_2048": 6.252672681268258, "val/perplexity_len_2048": 519.3991596320534, "val/loss_avg_len_1024": 6.255661408025445, "val/perplexity_len_1024": 520.9538238741642, "val/loss_avg_len_512": 6.264640129434224, "val/perplexity_len_512": 525.6523850962309} -{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1054.376383124967, "val/train_update_time": 693.2531012066174, "val/loss": 5.959701583172218, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.12631553603569, "val/val_tokens_per_second": 454473.2551906301, "val/loss_avg_len_2048": 5.959701583172218, "val/perplexity_len_2048": 387.4944721112208, "val/loss_avg_len_1024": 5.963742249931395, "val/perplexity_len_1024": 389.06337571774895, "val/loss_avg_len_512": 5.974755134170968, "val/perplexity_len_512": 393.37176599339256} -{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1283.0319452389958, "val/train_update_time": 831.1140817146515, "val/loss": 5.738160552069335, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.04604214400752, "val/val_tokens_per_second": 454878.40469983226, "val/loss_avg_len_2048": 5.738160552069335, "val/perplexity_len_2048": 310.4927501256864, "val/loss_avg_len_1024": 5.742955871155067, "val/perplexity_len_1024": 311.9852375530798, "val/loss_avg_len_512": 5.7549814947385345, "val/perplexity_len_512": 315.75970425425226} -{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1511.2121625279542, "val/train_update_time": 969.0007961746887, "val/loss": 5.5474918597602985, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.03976762795355, "val/val_tokens_per_second": 454910.1033806272, "val/loss_avg_len_2048": 5.5474918597602985, "val/perplexity_len_2048": 256.5931764754379, "val/loss_avg_len_1024": 5.553597456831346, "val/perplexity_len_1024": 258.1646234514658, "val/loss_avg_len_512": 5.567558555799723, "val/perplexity_len_512": 261.79416253348427} -{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1739.393865599006, "val/train_update_time": 1106.8950568859, "val/loss": 5.394616223489982, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.06370893697022, "val/val_tokens_per_second": 454789.17627815285, "val/loss_avg_len_2048": 5.394616223489982, "val/perplexity_len_2048": 220.21761652879357, "val/loss_avg_len_1024": 5.4015997172784065, "val/perplexity_len_1024": 221.7608873249335, "val/loss_avg_len_512": 5.416529589198996, "val/perplexity_len_512": 225.0965877994106} -{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1967.6012063919916, "val/train_update_time": 1244.7901415458764, "val/loss": 5.258616862811218, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.39274938497692, "val/val_tokens_per_second": 453133.6891364371, "val/loss_avg_len_2048": 5.258616862811218, "val/perplexity_len_2048": 192.21544701876132, "val/loss_avg_len_1024": 5.26661902688928, "val/perplexity_len_1024": 193.7597572352443, "val/loss_avg_len_512": 5.283242479863857, "val/perplexity_len_512": 197.00763411971045} -{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2196.1893687059637, "val/train_update_time": 1382.727176492801, "val/loss": 5.150326416279446, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.44550614495529, "val/val_tokens_per_second": 452869.37677538325, "val/loss_avg_len_2048": 5.150326416279446, "val/perplexity_len_2048": 172.4877839494621, "val/loss_avg_len_1024": 5.159069892106625, "val/perplexity_len_1024": 174.00253918175162, "val/loss_avg_len_512": 5.1770923202755865, "val/perplexity_len_512": 177.16691667765645} -{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2425.217230288952, "val/train_update_time": 1520.627368493646, "val/loss": 5.060396130196401, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.14630079700146, "val/val_tokens_per_second": 454372.49934677803, "val/loss_avg_len_2048": 5.060396130196401, "val/perplexity_len_2048": 157.65295505166543, "val/loss_avg_len_1024": 5.069972552723344, "val/perplexity_len_1024": 159.16995850145472, "val/loss_avg_len_512": 5.089300132444315, "val/perplexity_len_512": 162.27625041885165} -{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2653.526103938988, "val/train_update_time": 1658.537395758729, "val/loss": 4.9848958040447675, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9463361579692, "val/val_tokens_per_second": 455382.63980050915, "val/loss_avg_len_2048": 4.9848958040447675, "val/perplexity_len_2048": 146.18834199862263, "val/loss_avg_len_1024": 4.9951241761227605, "val/perplexity_len_1024": 147.69128397585118, "val/loss_avg_len_512": 5.015488221790642, "val/perplexity_len_512": 150.72970833293965} -{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2881.6501679039793, "val/train_update_time": 1796.4439407095779, "val/loss": 4.91457557297845, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.10853785695508, "val/val_tokens_per_second": 454562.9190546063, "val/loss_avg_len_2048": 4.91457557297845, "val/perplexity_len_2048": 136.26146448033256, "val/loss_avg_len_1024": 4.925280669786549, "val/perplexity_len_1024": 137.72799230477034, "val/loss_avg_len_512": 4.946453812780138, "val/perplexity_len_512": 140.67521768921586} -{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 3109.932256244996, "val/train_update_time": 1934.345054808713, "val/loss": 4.858456944823754, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.96858381299535, "val/val_tokens_per_second": 455270.03164946573, "val/loss_avg_len_2048": 4.858456944823754, "val/perplexity_len_2048": 128.8252641700353, "val/loss_avg_len_1024": 4.869785673670238, "val/perplexity_len_1024": 130.29298868563157, "val/loss_avg_len_512": 4.8919036218861125, "val/perplexity_len_512": 133.20690843461975} -{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3338.075166533992, "val/train_update_time": 2072.2588375147316, "val/loss": 4.805626461642539, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.97877326997695, "val/val_tokens_per_second": 455218.47555202275, "val/loss_avg_len_2048": 4.805626461642539, "val/perplexity_len_2048": 122.1960181662709, "val/loss_avg_len_1024": 4.817329806529312, "val/perplexity_len_1024": 123.63452154215173, "val/loss_avg_len_512": 4.840296959608327, "val/perplexity_len_512": 126.50691359617247} -{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3566.6879564279807, "val/train_update_time": 2210.160061070579, "val/loss": 4.758956886007242, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.99822524102638, "val/val_tokens_per_second": 455120.0858717386, "val/loss_avg_len_2048": 4.758956886007242, "val/perplexity_len_2048": 116.62421008285914, "val/loss_avg_len_1024": 4.771169604997524, "val/perplexity_len_1024": 118.05724158216387, "val/loss_avg_len_512": 4.794990886492469, "val/perplexity_len_512": 120.90327992508134} -{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3794.8868216549745, "val/train_update_time": 2348.075175291684, "val/loss": 4.716963442835352, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.26495351700578, "val/val_tokens_per_second": 453775.22952231066, "val/loss_avg_len_2048": 4.716963442835352, "val/perplexity_len_2048": 111.82816396723561, "val/loss_avg_len_1024": 4.72980757133565, "val/perplexity_len_1024": 113.27376313238463, "val/loss_avg_len_512": 4.75441891277507, "val/perplexity_len_512": 116.0961715577927} -{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 4023.3350287670037, "val/train_update_time": 2485.9809508775943, "val/loss": 4.678447687354149, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.28331762104062, "val/val_tokens_per_second": 453682.9292420046, "val/loss_avg_len_2048": 4.678447687354149, "val/perplexity_len_2048": 107.60290950314338, "val/loss_avg_len_1024": 4.692047001610324, "val/perplexity_len_1024": 109.07623065660846, "val/loss_avg_len_512": 4.717900191090349, "val/perplexity_len_512": 111.93296788447095} -{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 4251.801609379996, "val/train_update_time": 2623.8855139956577, "val/loss": 4.638916808897607, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.0385509430198, "val/val_tokens_per_second": 454916.25055051385, "val/loss_avg_len_2048": 4.638916808897607, "val/perplexity_len_2048": 103.43224998979402, "val/loss_avg_len_1024": 4.652840973052616, "val/perplexity_len_1024": 104.88253116221422, "val/loss_avg_len_512": 4.679233000243456, "val/perplexity_len_512": 107.68744464382641} -{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4480.02484513697, "val/train_update_time": 2761.798597707704, "val/loss": 4.607533364184201, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.03529246797552, "val/val_tokens_per_second": 454932.71446381963, "val/loss_avg_len_2048": 4.607533364184201, "val/perplexity_len_2048": 100.2365972702575, "val/loss_avg_len_1024": 4.622161411953904, "val/perplexity_len_1024": 101.71363978965609, "val/loss_avg_len_512": 4.649712271768321, "val/perplexity_len_512": 104.55489785290531} -{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4708.671424973989, "val/train_update_time": 2899.7122086867457, "val/loss": 4.5756238960157845, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.96249285701197, "val/val_tokens_per_second": 455300.85593673546, "val/loss_avg_len_2048": 4.5756238960157845, "val/perplexity_len_2048": 97.08859343179769, "val/loss_avg_len_1024": 4.590941229291865, "val/perplexity_len_1024": 98.58717964711914, "val/loss_avg_len_512": 4.6193589118688365, "val/perplexity_len_512": 101.42898636242586} -{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4936.824467270984, "val/train_update_time": 3037.629162015859, "val/loss": 4.546358786865138, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.17431224201573, "val/val_tokens_per_second": 454231.3546020608, "val/loss_avg_len_2048": 4.546358786865138, "val/perplexity_len_2048": 94.28845812477917, "val/loss_avg_len_1024": 4.562232828937704, "val/perplexity_len_1024": 95.79713983415002, "val/loss_avg_len_512": 4.591761249877047, "val/perplexity_len_512": 98.66805631960091} -{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 5165.18791128695, "val/train_update_time": 3175.5471031158813, "val/loss": 4.517468703701883, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98698865802726, "val/val_tokens_per_second": 455176.91625017143, "val/loss_avg_len_2048": 4.517468703701883, "val/perplexity_len_2048": 91.60342883737046, "val/loss_avg_len_1024": 4.533819358161278, "val/perplexity_len_1024": 93.11351666917558, "val/loss_avg_len_512": 4.564258050953411, "val/perplexity_len_512": 95.99134690061845} -{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 5393.351182107988, "val/train_update_time": 3313.452428144694, "val/loss": 4.492371192065021, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.90288911398966, "val/val_tokens_per_second": 455602.71092140325, "val/loss_avg_len_2048": 4.492371192065021, "val/perplexity_len_2048": 89.33302068707025, "val/loss_avg_len_1024": 4.509291414985387, "val/perplexity_len_1024": 90.85741549263298, "val/loss_avg_len_512": 4.540741394605115, "val/perplexity_len_512": 93.76028772891796} -{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5621.47650762595, "val/train_update_time": 3451.396044731722, "val/loss": 4.466965344157769, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.90857521002181, "val/val_tokens_per_second": 455573.8971985658, "val/loss_avg_len_2048": 4.466965344157769, "val/perplexity_len_2048": 87.0920272569869, "val/loss_avg_len_1024": 4.484603051171778, "val/perplexity_len_1024": 88.64175758666217, "val/loss_avg_len_512": 4.517253230262828, "val/perplexity_len_512": 91.58369285789568} -{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5850.004313221958, "val/train_update_time": 3589.3208626466803, "val/loss": 4.443324041806534, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.94098925299477, "val/val_tokens_per_second": 455409.71185878024, "val/loss_avg_len_2048": 4.443324041806534, "val/perplexity_len_2048": 85.05720599339685, "val/loss_avg_len_1024": 4.461723275988643, "val/perplexity_len_1024": 86.63667943799094, "val/loss_avg_len_512": 4.495576438648999, "val/perplexity_len_512": 89.61981442325171} -{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 6078.168775815982, "val/train_update_time": 3727.2589657856734, "val/loss": 4.420480709320446, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.0520894320216, "val/val_tokens_per_second": 454847.8581490308, "val/loss_avg_len_2048": 4.420480709320446, "val/perplexity_len_2048": 83.1362401197537, "val/loss_avg_len_1024": 4.4397112005579284, "val/perplexity_len_1024": 84.75046225296263, "val/loss_avg_len_512": 4.474702652350906, "val/perplexity_len_512": 87.76849877254239} -{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 6306.462939933001, "val/train_update_time": 3865.200992291735, "val/loss": 4.397374363187957, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.00921994104283, "val/val_tokens_per_second": 455064.4925800859, "val/loss_avg_len_2048": 4.397374363187957, "val/perplexity_len_2048": 81.23728878013355, "val/loss_avg_len_1024": 4.4172204922693314, "val/perplexity_len_1024": 82.86563928056967, "val/loss_avg_len_512": 4.4534774074878545, "val/perplexity_len_512": 85.92522209509893} -{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 6534.709841756965, "val/train_update_time": 4003.142038117745, "val/loss": 4.375490610505734, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.2145028290106, "val/val_tokens_per_second": 454028.994402753, "val/loss_avg_len_2048": 4.375490610505734, "val/perplexity_len_2048": 79.47882313421499, "val/loss_avg_len_1024": 4.396335411125421, "val/perplexity_len_1024": 81.15293096085392, "val/loss_avg_len_512": 4.434476591604389, "val/perplexity_len_512": 84.30798583032089} -{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6763.143532235001, "val/train_update_time": 4141.075749416603, "val/loss": 4.3550234247615105, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9800370450248, "val/val_tokens_per_second": 455212.08198107505, "val/loss_avg_len_2048": 4.3550234247615105, "val/perplexity_len_2048": 77.86864937000485, "val/loss_avg_len_1024": 4.376727943733195, "val/perplexity_len_1024": 79.5772257888411, "val/loss_avg_len_512": 4.416302177713905, "val/perplexity_len_512": 82.78957748754466} -{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 6991.763516802981, "val/train_update_time": 4279.012207661464, "val/loss": 4.3350345812852265, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.01144512498286, "val/val_tokens_per_second": 455053.2428751271, "val/loss_avg_len_2048": 4.3350345812852265, "val/perplexity_len_2048": 76.32759835057868, "val/loss_avg_len_1024": 4.357916226750007, "val/perplexity_len_1024": 78.09423408238275, "val/loss_avg_len_512": 4.399376267791725, "val/perplexity_len_512": 81.40008097539523} -{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 7219.974368761992, "val/train_update_time": 4416.951972602401, "val/loss": 4.3158074309751395, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.51169959100662, "val/val_tokens_per_second": 452538.18219175114, "val/loss_avg_len_2048": 4.3158074309751395, "val/perplexity_len_2048": 74.87405467461461, "val/loss_avg_len_1024": 4.339516162476549, "val/perplexity_len_1024": 76.67043432955377, "val/loss_avg_len_512": 4.382426561977342, "val/perplexity_len_512": 80.03200058280369} -{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 7448.697525598982, "val/train_update_time": 4554.898137903423, "val/loss": 4.298591563395016, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.11004326399416, "val/val_tokens_per_second": 454555.3249819229, "val/loss_avg_len_2048": 4.298591563395016, "val/perplexity_len_2048": 73.59606527697034, "val/loss_avg_len_1024": 4.323124243415986, "val/perplexity_len_1024": 75.42390320559682, "val/loss_avg_len_512": 4.367459438038338, "val/perplexity_len_512": 78.8430713314537} -{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 7677.034741098003, "val/train_update_time": 4692.8429151014425, "val/loss": 4.281831383403647, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.02878875797614, "val/val_tokens_per_second": 454965.5789562217, "val/loss_avg_len_2048": 4.281831383403647, "val/perplexity_len_2048": 72.37286117017557, "val/loss_avg_len_1024": 4.307208411879139, "val/perplexity_len_1024": 74.23297154202513, "val/loss_avg_len_512": 4.353032884626277, "val/perplexity_len_512": 77.71380286337892} -{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 7905.308589838969, "val/train_update_time": 4830.788480303425, "val/loss": 4.26693708552008, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98624766798457, "val/val_tokens_per_second": 455180.6643958197, "val/loss_avg_len_2048": 4.26693708552008, "val/perplexity_len_2048": 71.30290611174128, "val/loss_avg_len_1024": 4.293061920919362, "val/perplexity_len_1024": 73.19022844875921, "val/loss_avg_len_512": 4.340169534767326, "val/perplexity_len_512": 76.72054503555724} -{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 8133.938365876966, "val/train_update_time": 4968.716519999434, "val/loss": 4.252917671704688, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9889751879964, "val/val_tokens_per_second": 455166.86810167873, "val/loss_avg_len_2048": 4.252917671704688, "val/perplexity_len_2048": 70.31025561202095, "val/loss_avg_len_1024": 4.2801831485737125, "val/perplexity_len_1024": 72.25367195254603, "val/loss_avg_len_512": 4.329094496766571, "val/perplexity_len_512": 75.87554989747439} -{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 8362.121594669996, "val/train_update_time": 5106.633266707533, "val/loss": 4.240213508293801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.04761916899588, "val/val_tokens_per_second": 454870.4383080775, "val/loss_avg_len_2048": 4.240213508293801, "val/perplexity_len_2048": 69.42267257289618, "val/loss_avg_len_1024": 4.268120715982979, "val/perplexity_len_1024": 71.3873523702314, "val/loss_avg_len_512": 4.318212201548182, "val/perplexity_len_512": 75.05432626701605} -{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 8590.419437660952, "val/train_update_time": 5244.595247996622, "val/loss": 4.229185012385133, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.9895013620262, "val/val_tokens_per_second": 455164.2067136102, "val/loss_avg_len_2048": 4.229185012385133, "val/perplexity_len_2048": 68.66125129565661, "val/loss_avg_len_1024": 4.2580083735182885, "val/perplexity_len_1024": 70.66909675946545, "val/loss_avg_len_512": 4.309627850557212, "val/perplexity_len_512": 74.41279110805154} -{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 8818.629101625993, "val/train_update_time": 5382.53907933255, "val/loss": 4.219223060651217, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98893610399682, "val/val_tokens_per_second": 455167.06578977744, "val/loss_avg_len_2048": 4.219223060651217, "val/perplexity_len_2048": 67.98064692681132, "val/loss_avg_len_1024": 4.248455268475786, "val/perplexity_len_1024": 69.9972019057752, "val/loss_avg_len_512": 4.300836350739189, "val/perplexity_len_512": 73.76145835958208} -{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 9046.829455698957, "val/train_update_time": 5520.4718770905165, "val/loss": 4.21085463935493, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.0832858220092, "val/val_tokens_per_second": 454690.3415682538, "val/loss_avg_len_2048": 4.21085463935493, "val/perplexity_len_2048": 67.4141299657516, "val/loss_avg_len_1024": 4.240732025450281, "val/perplexity_len_1024": 69.45867875377851, "val/loss_avg_len_512": 4.294237298525683, "val/perplexity_len_512": 73.27630518070133} -{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 9275.580202061974, "val/train_update_time": 5658.411268384545, "val/loss": 4.203232686660252, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.94337259500753, "val/val_tokens_per_second": 455397.644298181, "val/loss_avg_len_2048": 4.203232686660252, "val/perplexity_len_2048": 66.90225587432732, "val/loss_avg_len_1024": 4.233364781867806, "val/perplexity_len_1024": 68.94884010724084, "val/loss_avg_len_512": 4.287355207336601, "val/perplexity_len_512": 72.77374229184319} -{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 9503.765275373997, "val/train_update_time": 5796.35883763741, "val/loss": 4.197044236282748, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.93833985400852, "val/val_tokens_per_second": 455423.127294632, "val/loss_avg_len_2048": 4.197044236282748, "val/perplexity_len_2048": 66.48951302027224, "val/loss_avg_len_1024": 4.227541121233999, "val/perplexity_len_1024": 68.5484723957413, "val/loss_avg_len_512": 4.282135505998321, "val/perplexity_len_512": 72.3948747397409} -{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 9731.945241109002, "val/train_update_time": 5934.31139906036, "val/loss": 4.192076116132666, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.10962512000697, "val/val_tokens_per_second": 454557.434296835, "val/loss_avg_len_2048": 4.192076116132666, "val/perplexity_len_2048": 66.1600043280041, "val/loss_avg_len_1024": 4.22289962515519, "val/perplexity_len_1024": 68.23104217534896, "val/loss_avg_len_512": 4.278101034266222, "val/perplexity_len_512": 72.10338805682221} -{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 9960.290040618973, "val/train_update_time": 6072.254921466229, "val/loss": 4.188129460239923, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98181012499845, "val/val_tokens_per_second": 455203.1120856573, "val/loss_avg_len_2048": 4.188129460239923, "val/perplexity_len_2048": 65.89940813706113, "val/loss_avg_len_1024": 4.219077997006941, "val/perplexity_len_1024": 67.97078612166737, "val/loss_avg_len_512": 4.274576161771455, "val/perplexity_len_512": 71.84968021406696} -{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 10188.515749696002, "val/train_update_time": 6210.206845312321, "val/loss": 4.185181051009335, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 89.98875552398385, "val/val_tokens_per_second": 455167.9791713901, "val/loss_avg_len_2048": 4.185181051009335, "val/perplexity_len_2048": 65.70539586814938, "val/loss_avg_len_1024": 4.216392289251415, "val/perplexity_len_1024": 67.78848137245132, "val/loss_avg_len_512": 4.2723089719016105, "val/perplexity_len_512": 71.68696786652427} -{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 10417.185683044954, "val/train_update_time": 6348.16824121651, "val/loss": 4.183063325667009, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.06988018500851, "val/val_tokens_per_second": 454758.01584132115, "val/loss_avg_len_2048": 4.183063325667009, "val/perplexity_len_2048": 65.56639711872695, "val/loss_avg_len_1024": 4.214440592087014, "val/perplexity_len_1024": 67.65630780891767, "val/loss_avg_len_512": 4.270656678032875, "val/perplexity_len_512": 71.56861773057756} -{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 10645.485156638955, "val/train_update_time": 6486.112916803453, "val/loss": 4.18173328043411, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.03489440103294, "val/val_tokens_per_second": 454934.7258359208, "val/loss_avg_len_2048": 4.18173328043411, "val/perplexity_len_2048": 65.47924881324224, "val/loss_avg_len_1024": 4.21317458131751, "val/perplexity_len_1024": 67.57070839093234, "val/loss_avg_len_512": 4.269501390692499, "val/perplexity_len_512": 71.48598315507563} -{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 10873.77756452799, "val/train_update_time": 6624.0805201563635, "val/loss": 4.1809452712302795, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.02757424901938, "val/val_tokens_per_second": 454971.7166287656, "val/loss_avg_len_2048": 4.1809452712302795, "val/perplexity_len_2048": 65.42767088712672, "val/loss_avg_len_1024": 4.212386983394856, "val/perplexity_len_1024": 67.51751079327998, "val/loss_avg_len_512": 4.268753702273779, "val/perplexity_len_512": 71.43255389008019} -{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 11102.099602014001, "val/train_update_time": 6762.03791546938, "val/loss": 4.18061518369345, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.24074817897053, "val/val_tokens_per_second": 453896.94596465246, "val/loss_avg_len_2048": 4.18061518369345, "val/perplexity_len_2048": 65.4060775924379, "val/loss_avg_len_1024": 4.212110854971316, "val/perplexity_len_1024": 67.49886986322706, "val/loss_avg_len_512": 4.2685156018145385, "val/perplexity_len_512": 71.4155477908546} +{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 110.51794190204237, "val/train_update_time": 110.19485108501976, "val/loss": 8.072670181155205, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.9716204389697, "val/val_tokens_per_second": 409716.2756805078, "val/loss_avg_len_2048": 8.072670181155205, "val/perplexity_len_2048": 3205.6500777398833, "val/loss_avg_len_1024": 8.071338223218918, "val/perplexity_len_1024": 3201.383129006829, "val/loss_avg_len_512": 8.071987850761413, "val/perplexity_len_512": 3203.463511325176} +{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 317.24918574804906, "val/train_update_time": 216.73109497816768, "val/loss": 7.519811360669136, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.60113897896372, "val/val_tokens_per_second": 411240.27716842643, "val/loss_avg_len_2048": 7.519811360669136, "val/perplexity_len_2048": 1844.219368930416, "val/loss_avg_len_1024": 7.520807962584495, "val/perplexity_len_1024": 1846.058237643676, "val/loss_avg_len_512": 7.525793135547638, "val/perplexity_len_512": 1855.2841344973567} +{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 523.6280801940011, "val/train_update_time": 323.28231063415296, "val/loss": 7.164581921425462, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.44210483605275, "val/val_tokens_per_second": 411897.9587925008, "val/loss_avg_len_2048": 7.164581921425462, "val/perplexity_len_2048": 1292.8209870442263, "val/loss_avg_len_1024": 7.166963593649864, "val/perplexity_len_1024": 1295.9037324675646, "val/loss_avg_len_512": 7.175143612968922, "val/perplexity_len_512": 1306.5477247144242} +{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 729.8428020050051, "val/train_update_time": 429.83724463917315, "val/loss": 6.865911299175769, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.848096723028, "val/val_tokens_per_second": 406155.409283466, "val/loss_avg_len_2048": 6.865911299175769, "val/perplexity_len_2048": 959.0193952924727, "val/loss_avg_len_1024": 6.87047398582399, "val/perplexity_len_1024": 963.4050979677779, "val/loss_avg_len_512": 6.882724252340198, "val/perplexity_len_512": 975.2796519063501} +{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 937.4650811910396, "val/train_update_time": 536.3882422860479, "val/loss": 6.6218816578798005, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.0134965860052, "val/val_tokens_per_second": 409544.7254439007, "val/loss_avg_len_2048": 6.6218816578798005, "val/perplexity_len_2048": 751.3575656878406, "val/loss_avg_len_1024": 6.628069533909858, "val/perplexity_len_1024": 756.0212875438285, "val/loss_avg_len_512": 6.643743345025182, "val/perplexity_len_512": 767.9643747482285} +{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1144.6911778100184, "val/train_update_time": 642.9422557381331, "val/loss": 6.436701403411478, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.36018703901209, "val/val_tokens_per_second": 408129.96875023755, "val/loss_avg_len_2048": 6.436701403411478, "val/perplexity_len_2048": 624.3439406192032, "val/loss_avg_len_1024": 6.444007082933933, "val/perplexity_len_1024": 628.9218995499804, "val/loss_avg_len_512": 6.4616794860377915, "val/perplexity_len_512": 640.1352524873843} +{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1351.811079526029, "val/train_update_time": 749.4968144011218, "val/loss": 6.28673181735389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.59608248295262, "val/val_tokens_per_second": 411261.15584928676, "val/loss_avg_len_2048": 6.28673181735389, "val/perplexity_len_2048": 537.3941537499013, "val/loss_avg_len_1024": 6.294847987662257, "val/perplexity_len_1024": 541.7734838793829, "val/loss_avg_len_512": 6.313898871052266, "val/perplexity_len_512": 552.1936892494012} +{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1558.167400816048, "val/train_update_time": 856.0381595880608, "val/loss": 6.1621321680786085, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.55671566096134, "val/val_tokens_per_second": 411423.7771712816, "val/loss_avg_len_2048": 6.1621321680786085, "val/perplexity_len_2048": 474.43857996407974, "val/loss_avg_len_1024": 6.1707096502751115, "val/perplexity_len_1024": 478.52557142759065, "val/loss_avg_len_512": 6.190553567818553, "val/perplexity_len_512": 488.11623688017033} +{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1764.4956908360473, "val/train_update_time": 962.5948072728934, "val/loss": 6.043532439414506, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.50245448201895, "val/val_tokens_per_second": 411648.13685477345, "val/loss_avg_len_2048": 6.043532439414506, "val/perplexity_len_2048": 421.37890441996467, "val/loss_avg_len_1024": 6.052711034156941, "val/perplexity_len_1024": 425.2643749180878, "val/loss_avg_len_512": 6.073728867790103, "val/perplexity_len_512": 434.2971024683484} +{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1970.7506705410196, "val/train_update_time": 1069.132912081608, "val/loss": 5.946526206344739, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.66884558799211, "val/val_tokens_per_second": 410960.9152023205, "val/loss_avg_len_2048": 5.946526206344739, "val/perplexity_len_2048": 382.4225718901975, "val/loss_avg_len_1024": 5.955733354584128, "val/perplexity_len_1024": 385.95985234406163, "val/loss_avg_len_512": 5.977101679090039, "val/perplexity_len_512": 394.2959143659733} +{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2177.6055200890405, "val/train_update_time": 1175.6836349036312, "val/loss": 5.859628180498444, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.70184189802967, "val/val_tokens_per_second": 410824.9077473609, "val/loss_avg_len_2048": 5.859628180498444, "val/perplexity_len_2048": 350.59376218425683, "val/loss_avg_len_1024": 5.869486711973604, "val/perplexity_len_1024": 354.06719516588015, "val/loss_avg_len_512": 5.89188057346195, "val/perplexity_len_512": 362.0855730249886} +{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2384.0519924180117, "val/train_update_time": 1282.2238651026273, "val/loss": 5.783013271738776, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.6605639460031, "val/val_tokens_per_second": 410995.0654322251, "val/loss_avg_len_2048": 5.783013271738776, "val/perplexity_len_2048": 324.7362361487591, "val/loss_avg_len_1024": 5.79357122720005, "val/perplexity_len_1024": 328.1829499750509, "val/loss_avg_len_512": 5.816968626810331, "val/perplexity_len_512": 335.95211227350995} +{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2590.4739345350536, "val/train_update_time": 1388.759745098534, "val/loss": 5.716440834625299, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.56407831504475, "val/val_tokens_per_second": 411393.3528354743, "val/loss_avg_len_2048": 5.716440834625299, "val/perplexity_len_2048": 303.8216448390781, "val/loss_avg_len_1024": 5.72740313000246, "val/perplexity_len_1024": 307.17054975726995, "val/loss_avg_len_512": 5.751391485624389, "val/perplexity_len_512": 314.62815638697555} +{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 2796.791148387012, "val/train_update_time": 1495.3029013883206, "val/loss": 5.649823216465559, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.50496849097544, "val/val_tokens_per_second": 411637.73649870406, "val/loss_avg_len_2048": 5.649823216465559, "val/perplexity_len_2048": 284.24121221591713, "val/loss_avg_len_1024": 5.6613804133704875, "val/perplexity_len_1024": 287.5453000942534, "val/loss_avg_len_512": 5.686038017921988, "val/perplexity_len_512": 294.7236147202462} +{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3003.058338998002, "val/train_update_time": 1601.8528411513544, "val/loss": 5.595519945517543, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.44042880897177, "val/val_tokens_per_second": 411904.9011613321, "val/loss_avg_len_2048": 5.595519945517543, "val/perplexity_len_2048": 269.21759218828305, "val/loss_avg_len_1024": 5.607140162670961, "val/perplexity_len_1024": 272.36420583089864, "val/loss_avg_len_512": 5.632021551703382, "val/perplexity_len_512": 279.2260172957314} +{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3209.7035325120087, "val/train_update_time": 1708.4027871834696, "val/loss": 5.542625576345867, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.4260749219684, "val/val_tokens_per_second": 411964.3668136979, "val/loss_avg_len_2048": 5.542625576345867, "val/perplexity_len_2048": 255.34755458916246, "val/loss_avg_len_1024": 5.554492764933896, "val/perplexity_len_1024": 258.39586383088005, "val/loss_avg_len_512": 5.579607635878865, "val/perplexity_len_512": 264.9676215997508} +{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3415.8845459170407, "val/train_update_time": 1814.9587236176012, "val/loss": 5.495897741303175, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.34644473099615, "val/val_tokens_per_second": 404158.23276998143, "val/loss_avg_len_2048": 5.495897741303175, "val/perplexity_len_2048": 243.69019874687106, "val/loss_avg_len_1024": 5.50820064414118, "val/perplexity_len_1024": 246.7068140975722, "val/loss_avg_len_512": 5.5340119562351155, "val/perplexity_len_512": 253.1575333040166} +{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 3623.980893074011, "val/train_update_time": 1921.4990626386134, "val/loss": 5.454193331815151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.68043452501297, "val/val_tokens_per_second": 402830.6939416551, "val/loss_avg_len_2048": 5.454193331815151, "val/perplexity_len_2048": 233.73624740691568, "val/loss_avg_len_1024": 5.466780889385893, "val/perplexity_len_1024": 236.69701117624166, "val/loss_avg_len_512": 5.492807074442693, "val/perplexity_len_512": 242.9381962186342} +{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 3832.4348147350247, "val/train_update_time": 2028.0436954226461, "val/loss": 5.414499440166541, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.82433698000386, "val/val_tokens_per_second": 406251.12177155656, "val/loss_avg_len_2048": 5.414499440166541, "val/perplexity_len_2048": 224.64007173409922, "val/loss_avg_len_1024": 5.427113480050698, "val/perplexity_len_1024": 227.4916376312745, "val/loss_avg_len_512": 5.453149790130276, "val/perplexity_len_512": 233.4924611121837} +{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4040.0474286440294, "val/train_update_time": 2134.611642122676, "val/loss": 5.380052010235644, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.62901382899145, "val/val_tokens_per_second": 411125.2177032077, "val/loss_avg_len_2048": 5.380052010235644, "val/perplexity_len_2048": 217.03356309816712, "val/loss_avg_len_1024": 5.393027071399452, "val/perplexity_len_1024": 219.8679351651408, "val/loss_avg_len_512": 5.4193882748374715, "val/perplexity_len_512": 225.74098881334308} +{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4246.9178847110015, "val/train_update_time": 2241.15703301772, "val/loss": 5.349708102982907, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.11544794699876, "val/val_tokens_per_second": 409127.6705037995, "val/loss_avg_len_2048": 5.349708102982907, "val/perplexity_len_2048": 210.5468309042001, "val/loss_avg_len_1024": 5.362598578461824, "val/perplexity_len_1024": 213.27844776051109, "val/loss_avg_len_512": 5.388854680254846, "val/perplexity_len_512": 218.95247130422538} +{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4453.811728182016, "val/train_update_time": 2347.708159423666, "val/loss": 5.3179078935331665, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.09129549999489, "val/val_tokens_per_second": 405178.3073647728, "val/loss_avg_len_2048": 5.3179078935331665, "val/perplexity_len_2048": 203.9567361248487, "val/loss_avg_len_1024": 5.331117789819115, "val/perplexity_len_1024": 206.6688574375496, "val/loss_avg_len_512": 5.357798411159706, "val/perplexity_len_512": 212.2571287414731} +{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 4661.668510478048, "val/train_update_time": 2454.2510100168292, "val/loss": 5.291644628655117, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.32744767802069, "val/val_tokens_per_second": 404234.0050857196, "val/loss_avg_len_2048": 5.291644628655117, "val/perplexity_len_2048": 198.66989507746274, "val/loss_avg_len_1024": 5.304906864394906, "val/perplexity_len_1024": 201.32225127081918, "val/loss_avg_len_512": 5.331710340877599, "val/perplexity_len_512": 206.79135557739693} +{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 4869.772359199007, "val/train_update_time": 2560.802637038869, "val/loss": 5.267185813411148, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.22516736399848, "val/val_tokens_per_second": 404642.45272829, "val/loss_avg_len_2048": 5.267185813411148, "val/perplexity_len_2048": 193.86960878237306, "val/loss_avg_len_1024": 5.280538284004224, "val/perplexity_len_1024": 196.47560656877008, "val/loss_avg_len_512": 5.307623009739281, "val/perplexity_len_512": 201.8698150611123} +{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5077.749329403043, "val/train_update_time": 2667.341171991662, "val/loss": 5.244305751117928, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.12790367001435, "val/val_tokens_per_second": 405031.6333428074, "val/loss_avg_len_2048": 5.244305751117928, "val/perplexity_len_2048": 189.48422034685896, "val/loss_avg_len_1024": 5.257994546739722, "val/perplexity_len_1024": 192.095865469511, "val/loss_avg_len_512": 5.285545470022318, "val/perplexity_len_512": 197.46186360448596} +{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5286.093386778026, "val/train_update_time": 2773.886181908485, "val/loss": 5.224113088942878, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.72626927896636, "val/val_tokens_per_second": 402649.1907186179, "val/loss_avg_len_2048": 5.224113088942878, "val/perplexity_len_2048": 185.69640127735994, "val/loss_avg_len_1024": 5.2377807637095275, "val/perplexity_len_1024": 188.25186312502768, "val/loss_avg_len_512": 5.265193086130079, "val/perplexity_len_512": 193.48366419285284} +{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 5494.618768353015, "val/train_update_time": 2880.4332658784697, "val/loss": 5.204351608313579, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.6204348889878, "val/val_tokens_per_second": 403068.53680310975, "val/loss_avg_len_2048": 5.204351608313579, "val/perplexity_len_2048": 182.0627864926695, "val/loss_avg_len_1024": 5.218173131045303, "val/perplexity_len_1024": 184.59664196761443, "val/loss_avg_len_512": 5.245799912956764, "val/perplexity_len_512": 189.76755205696549} +{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 5703.016606429999, "val/train_update_time": 2986.9915024373913, "val/loss": 5.187859910127178, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.53107214701595, "val/val_tokens_per_second": 407436.2197202113, "val/loss_avg_len_2048": 5.187859910127178, "val/perplexity_len_2048": 179.08488478780052, "val/loss_avg_len_1024": 5.201843161700579, "val/perplexity_len_1024": 181.6066640311323, "val/loss_avg_len_512": 5.229689651350794, "val/perplexity_len_512": 186.7348416205781} +{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 5910.318671693036, "val/train_update_time": 3093.5336661074543, "val/loss": 5.1715570674947635, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.04047837899998, "val/val_tokens_per_second": 409434.2676453867, "val/loss_avg_len_2048": 5.1715570674947635, "val/perplexity_len_2048": 176.18896211925417, "val/loss_avg_len_1024": 5.185615149655531, "val/perplexity_len_1024": 178.68333297978904, "val/loss_avg_len_512": 5.213619250194659, "val/perplexity_len_512": 183.75792201945603} +{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6117.130997166038, "val/train_update_time": 3200.0810677845147, "val/loss": 5.157462475304946, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.89286627602996, "val/val_tokens_per_second": 410039.2903615046, "val/loss_avg_len_2048": 5.157462475304946, "val/perplexity_len_2048": 173.72306924924297, "val/loss_avg_len_1024": 5.171683135021537, "val/perplexity_len_1024": 176.2111752260996, "val/loss_avg_len_512": 5.199810537778202, "val/perplexity_len_512": 181.23790088671788} +{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 6324.271360302053, "val/train_update_time": 3306.647373120475, "val/loss": 5.143889664166529, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.14473170100246, "val/val_tokens_per_second": 409008.0357126763, "val/loss_avg_len_2048": 5.143889664166529, "val/perplexity_len_2048": 171.38108842507802, "val/loss_avg_len_1024": 5.15810240709968, "val/perplexity_len_1024": 173.83427574322482, "val/loss_avg_len_512": 5.18635319549219, "val/perplexity_len_512": 178.81525814719697} +{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 6531.185974933032, "val/train_update_time": 3413.19108713849, "val/loss": 5.131008138038125, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.94807949004462, "val/val_tokens_per_second": 409812.7768836203, "val/loss_avg_len_2048": 5.131008138038125, "val/perplexity_len_2048": 169.1875965491763, "val/loss_avg_len_1024": 5.145431026424136, "val/perplexity_len_1024": 171.6454524544731, "val/loss_avg_len_512": 5.173947728090605, "val/perplexity_len_512": 176.61067401256858} +{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 6737.915407613036, "val/train_update_time": 3519.7321525084553, "val/loss": 5.120392005993269, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.73520365200238, "val/val_tokens_per_second": 410687.4854631898, "val/loss_avg_len_2048": 5.120392005993269, "val/perplexity_len_2048": 167.40097894766583, "val/loss_avg_len_1024": 5.13483073173333, "val/perplexity_len_1024": 169.83556966900616, "val/loss_avg_len_512": 5.163387195022637, "val/perplexity_len_512": 174.75538481351157} +{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 6944.4543808570015, "val/train_update_time": 3626.2884129853337, "val/loss": 5.110702060040148, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.9236697970191, "val/val_tokens_per_second": 409912.8873389507, "val/loss_avg_len_2048": 5.110702060040148, "val/perplexity_len_2048": 165.78670624776979, "val/loss_avg_len_1024": 5.125157545317412, "val/perplexity_len_1024": 168.20063878756756, "val/loss_avg_len_512": 5.153955277375667, "val/perplexity_len_512": 173.11485524738933} +{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 7151.144483595039, "val/train_update_time": 3732.833151328552, "val/loss": 5.1014151430890955, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.96349205297884, "val/val_tokens_per_second": 409749.59116366145, "val/loss_avg_len_2048": 5.1014151430890955, "val/perplexity_len_2048": 164.25418608357114, "val/loss_avg_len_1024": 5.116010741177417, "val/perplexity_len_1024": 166.6691552558016, "val/loss_avg_len_512": 5.1449303319406585, "val/perplexity_len_512": 171.55953203505013} +{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 7358.337214609026, "val/train_update_time": 3839.3859579174896, "val/loss": 5.093367775315, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.89132478396641, "val/val_tokens_per_second": 410045.61796115554, "val/loss_avg_len_2048": 5.093367775315, "val/perplexity_len_2048": 162.93767656263796, "val/loss_avg_len_1024": 5.108063309450646, "val/perplexity_len_1024": 165.34981315838286, "val/loss_avg_len_512": 5.137162590507989, "val/perplexity_len_512": 170.23206433751022} +{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 7565.03086849401, "val/train_update_time": 3945.9412919793394, "val/loss": 5.086384535997531, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.77674050303176, "val/val_tokens_per_second": 410516.5171110737, "val/loss_avg_len_2048": 5.086384535997531, "val/perplexity_len_2048": 161.80380742097068, "val/loss_avg_len_1024": 5.101098106556922, "val/perplexity_len_1024": 164.20211975991967, "val/loss_avg_len_512": 5.1302048265572875, "val/perplexity_len_512": 169.05174078500627} +{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 7771.636799781001, "val/train_update_time": 4052.503600837372, "val/loss": 5.080432867527884, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.85730355099076, "val/val_tokens_per_second": 410185.31988583424, "val/loss_avg_len_2048": 5.080432867527884, "val/perplexity_len_2048": 160.84366486138146, "val/loss_avg_len_1024": 5.09520770714046, "val/perplexity_len_1024": 163.2377467490014, "val/loss_avg_len_512": 5.124402106901945, "val/perplexity_len_512": 168.07362154641356} +{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 7978.290488699044, "val/train_update_time": 4159.058140857378, "val/loss": 5.0751120198699065, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.50360889302101, "val/val_tokens_per_second": 403532.4501926773, "val/loss_avg_len_2048": 5.0751120198699065, "val/perplexity_len_2048": 159.9901130472598, "val/loss_avg_len_1024": 5.08987260026416, "val/perplexity_len_1024": 162.36917494575445, "val/loss_avg_len_512": 5.11916280782083, "val/perplexity_len_512": 167.19533638390328} +{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 8186.5969784220215, "val/train_update_time": 4265.60545003548, "val/loss": 5.070432775841683, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.58954598399578, "val/val_tokens_per_second": 403191.0921863236, "val/loss_avg_len_2048": 5.070432775841683, "val/perplexity_len_2048": 159.24322905521015, "val/loss_avg_len_1024": 5.085260728531238, "val/perplexity_len_1024": 161.6220732303632, "val/loss_avg_len_512": 5.1146308313941, "val/perplexity_len_512": 166.43932546654347} +{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 8395.40680388402, "val/train_update_time": 4372.159924876352, "val/loss": 5.0664675774028645, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.43134209199343, "val/val_tokens_per_second": 403819.95500810014, "val/loss_avg_len_2048": 5.0664675774028645, "val/perplexity_len_2048": 158.6130482735774, "val/loss_avg_len_1024": 5.0813700341027355, "val/perplexity_len_1024": 160.99447282291885, "val/loss_avg_len_512": 5.110828997220903, "val/perplexity_len_512": 165.8077520805886} +{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 8603.637448858004, "val/train_update_time": 4478.715910169412, "val/loss": 5.063192765613369, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.8011323119863, "val/val_tokens_per_second": 410416.1851786989, "val/loss_avg_len_2048": 5.063192765613369, "val/perplexity_len_2048": 158.09446997973149, "val/loss_avg_len_1024": 5.078104476989369, "val/perplexity_len_1024": 160.46959365486623, "val/loss_avg_len_512": 5.107591516341537, "val/perplexity_len_512": 165.27182065555164} +{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 8810.230298971, "val/train_update_time": 4585.266904477379, "val/loss": 5.060545605122825, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.59589945495827, "val/val_tokens_per_second": 411261.91162643145, "val/loss_avg_len_2048": 5.060545605122825, "val/perplexity_len_2048": 157.67652197681062, "val/loss_avg_len_1024": 5.075487850701, "val/perplexity_len_1024": 160.0502535650356, "val/loss_avg_len_512": 5.104997573237831, "val/perplexity_len_512": 164.84367049477257} +{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 9016.607044072007, "val/train_update_time": 4691.8174923404, "val/loss": 5.058485259354685, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.55429247097345, "val/val_tokens_per_second": 411433.79138516303, "val/loss_avg_len_2048": 5.058485259354685, "val/perplexity_len_2048": 157.35198826265747, "val/loss_avg_len_1024": 5.073454285918526, "val/perplexity_len_1024": 159.72511171651485, "val/loss_avg_len_512": 5.103029847818229, "val/perplexity_len_512": 164.51962233753557} +{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 9222.955616571999, "val/train_update_time": 4798.3678903255495, "val/loss": 5.056988405886034, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.03087017301004, "val/val_tokens_per_second": 409473.594792857, "val/loss_avg_len_2048": 5.056988405886034, "val/perplexity_len_2048": 157.1166315844018, "val/loss_avg_len_1024": 5.071977700420563, "val/perplexity_len_1024": 159.48943797193047, "val/loss_avg_len_512": 5.1015830969281035, "val/perplexity_len_512": 164.2817755215082} +{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 9430.23481100105, "val/train_update_time": 4904.91280556639, "val/loss": 5.055763588758994, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.07011089101434, "val/val_tokens_per_second": 409313.02698973974, "val/loss_avg_len_2048": 5.055763588758994, "val/perplexity_len_2048": 156.92431024637185, "val/loss_avg_len_1024": 5.0707623975899185, "val/perplexity_len_1024": 159.29572773864462, "val/loss_avg_len_512": 5.100373116077785, "val/perplexity_len_512": 164.08311792924914} +{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 9637.072636537021, "val/train_update_time": 5011.441395019239, "val/loss": 5.055077503644489, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.58326662296895, "val/val_tokens_per_second": 407224.79369790596, "val/loss_avg_len_2048": 5.055077503644489, "val/perplexity_len_2048": 156.81668373770228, "val/loss_avg_len_1024": 5.070088747809251, "val/perplexity_len_1024": 159.1884543429987, "val/loss_avg_len_512": 5.099732340347279, "val/perplexity_len_512": 163.9780111280343} +{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 9844.444105764036, "val/train_update_time": 5117.988080174255, "val/loss": 5.054681106794774, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.61741263099248, "val/val_tokens_per_second": 411173.09633132076, "val/loss_avg_len_2048": 5.054681106794774, "val/perplexity_len_2048": 156.75453441699727, "val/loss_avg_len_1024": 5.069690365717549, "val/perplexity_len_1024": 159.12504914416917, "val/loss_avg_len_512": 5.0993228254112655, "val/perplexity_len_512": 163.9108734311827} +{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 10050.842236216005, "val/train_update_time": 5224.530718925176, "val/loss": 5.054502099541319, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.76580651698168, "val/val_tokens_per_second": 410561.5082962114, "val/loss_avg_len_2048": 5.054502099541319, "val/perplexity_len_2048": 156.72647672966426, "val/loss_avg_len_1024": 5.069511245464872, "val/perplexity_len_1024": 159.09654917769598, "val/loss_avg_len_512": 5.099155388283602, "val/perplexity_len_512": 163.88343096285075} diff --git a/metrics/npz/train_eval/step-000000104857600.npz b/metrics/npz/train_eval/step-000000104857600.npz index 4b484d349da5bb9e62e3c5a48c6b1b42d1267435..da9f3826d0e36ddbf51a01de6bd8bd0ec6bbd342 100644 --- a/metrics/npz/train_eval/step-000000104857600.npz +++ b/metrics/npz/train_eval/step-000000104857600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b243beae0017a1de2241d891675cf674d1db79105c4fe483dabc70ebadf9b5c +oid sha256:5ca2b6fd9a44859a14d30a8b8b835115a070dce0ccab094a1711a916bcdee76b size 20540 diff --git a/metrics/npz/train_eval/step-000000209715200.npz b/metrics/npz/train_eval/step-000000209715200.npz index 048e77e00139f9607f77334cd87ee6b45d0deebb..9822df250167c1037c54076bf41c226ac7758946 100644 --- a/metrics/npz/train_eval/step-000000209715200.npz +++ b/metrics/npz/train_eval/step-000000209715200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eda57ce5fa7e74cdb9e7f0ecef0f337976b9c167dfd02431babe46d869215dfb +oid sha256:efc94aaa1ae3287e6e0ecc7f4ae4f5d4c9f8a5fbf7ba1be95807a3ec73607dd6 size 20540 diff --git a/metrics/npz/train_eval/step-000000314572800.npz b/metrics/npz/train_eval/step-000000314572800.npz index d1f0d6acb5fd87a81b3233a9c4aa9fbdace5c716..33576b85a7e4b939fb98b5fdbd80df853b2b67e1 100644 --- a/metrics/npz/train_eval/step-000000314572800.npz +++ b/metrics/npz/train_eval/step-000000314572800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72a68d2aa605168cb796faa852fdaca040301e592ef9fac523d5a38bd6f9d9d5 +oid sha256:bf8679ea18aa02fece3c2aee47599c098b3e7ec97a05ca9b1ca46dac8b712223 size 20540 diff --git a/metrics/npz/train_eval/step-000000419430400.npz b/metrics/npz/train_eval/step-000000419430400.npz index 1162bb9061fd74ae56f9e20c6e63d1c561e586e9..f744490e5d1f113a349a5fb313ac225d77a3b47e 100644 --- a/metrics/npz/train_eval/step-000000419430400.npz +++ b/metrics/npz/train_eval/step-000000419430400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08cefc36c5fb36c5fbb9e2d77118aa51da7c8c50ab277fe269e217ce5f068878 +oid sha256:d895609bd6b3c4f3a17d8069f4131872034adb440ed9810322ce88838c1c3990 size 20540 diff --git a/metrics/npz/train_eval/step-000000524288000.npz b/metrics/npz/train_eval/step-000000524288000.npz index 7e43d699c7b285a909725c7e490ed7ff2561c9d7..f08b488488333a83f619c67ca615389b4f9621f6 100644 --- a/metrics/npz/train_eval/step-000000524288000.npz +++ b/metrics/npz/train_eval/step-000000524288000.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d41ed5d12b8ce626320f509bbec6fb2b308fb4d50ffbff24e27cb58c85f5d34 +oid sha256:9f8c1ca16cfdc3493df8af18fdfaaae4386ecf358be656b38d39920e6bd972e0 size 20540 diff --git a/metrics/npz/train_eval/step-000000629145600.npz b/metrics/npz/train_eval/step-000000629145600.npz index a05f356449a904c14bbec3bce1d85ec5353c1562..ec17c81a8fc2794c20c882cd37cb882cc3645912 100644 --- a/metrics/npz/train_eval/step-000000629145600.npz +++ b/metrics/npz/train_eval/step-000000629145600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cf5797d2799a6769f1e8af130bf4c75f3ea117a3c555001a4475205ae595401 +oid sha256:92cdfac9bbbe35185425c7e209695a6e67e906a583c716b17cd311222fd408d2 size 20540 diff --git a/metrics/npz/train_eval/step-000000734003200.npz b/metrics/npz/train_eval/step-000000734003200.npz index 3e8f42571788868e997fb71049b9c77cb222505b..ad29a88ae98991cb4f06e833bcd29a1f24776f1d 100644 --- a/metrics/npz/train_eval/step-000000734003200.npz +++ b/metrics/npz/train_eval/step-000000734003200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6a82b3a7e23b06c60fc4d81b7e7b4d7ec0854601e9e233610c882cfa323d417 +oid sha256:eb65bb2177a5d1c53c6726ab856a944b566fd208c9b92e1b80dd0e03564f2eab size 20540 diff --git a/metrics/npz/train_eval/step-000000838860800.npz b/metrics/npz/train_eval/step-000000838860800.npz index df8911ff048d04cdcda065e0623b7ca711ec2f78..89b8ddfe10be5f898828542c466be5bbda5b373c 100644 --- a/metrics/npz/train_eval/step-000000838860800.npz +++ b/metrics/npz/train_eval/step-000000838860800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05967c04c01a405ee98a3c54095fc8221fdab1569c6d496e8ed034be6f065953 +oid sha256:9aa86ff00d18908412d7f09cd7617e7a41dbd33dddb13d3e9c91630b332d1496 size 20540 diff --git a/metrics/npz/train_eval/step-000000943718400.npz b/metrics/npz/train_eval/step-000000943718400.npz index f573b493c9c7d969fb5edc69c3d4bca7be67ff5c..6cc7b2e77843410bc53e9012475db55a38d668a8 100644 --- a/metrics/npz/train_eval/step-000000943718400.npz +++ b/metrics/npz/train_eval/step-000000943718400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dab29252c1feec76660212adf00e9dbb9ff7c15cfa8e43c21543d7c75dfa54f9 +oid sha256:c087c1312c95a2beb753576a44d0c77fcf698d9d9bb1643ae3bb43ca6d99c908 size 20540 diff --git a/metrics/npz/train_eval/step-000001048576000.npz b/metrics/npz/train_eval/step-000001048576000.npz index ec04adef533110b731412d59868b17dea597936c..ac0762f40da74100e7c0503232408996ae9e36eb 100644 --- a/metrics/npz/train_eval/step-000001048576000.npz +++ b/metrics/npz/train_eval/step-000001048576000.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a16448c38926d679a325916a3128a4bd64686744d129c1edd142f31bffc7809e +oid sha256:94cee7bf23eb0b037070c3e131b17f61ff9256e5070bae1a58af07dccc4c1eb7 size 20540 diff --git a/metrics/npz/train_eval/step-000001153433600.npz b/metrics/npz/train_eval/step-000001153433600.npz index ba9ba9825d4007d34dbddc2a4431ec7c2b3525d7..175d74529c056dff267785c96a7743c00e67f548 100644 --- a/metrics/npz/train_eval/step-000001153433600.npz +++ b/metrics/npz/train_eval/step-000001153433600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c85c15e520a1e15e450c9acfa31c3d48d6e1b3f1b7e0b618c4f1fa8b5e254f40 +oid sha256:f0592dcf61d27ffb9891c62209eddcec508944613cde77346812c2cee129814e size 20540 diff --git a/metrics/npz/train_eval/step-000001258291200.npz b/metrics/npz/train_eval/step-000001258291200.npz index 74054bda0c34bf6c0746499e970df33ac6d7150b..18e7bcb158d895467de73d2b50f0e4799066ee19 100644 --- a/metrics/npz/train_eval/step-000001258291200.npz +++ b/metrics/npz/train_eval/step-000001258291200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a31d108a0abb8742d7cc9ffcb4ba4c2129ed63187aa68cf047784d028b81d6d2 +oid sha256:0d7fd6cff024296d3d4e37a5568be8739860f647d958ef0cbee26959e9c6744c size 20540 diff --git a/metrics/npz/train_eval/step-000001363148800.npz b/metrics/npz/train_eval/step-000001363148800.npz index cf142823845805d04e4d8399e7a30f7f062a5786..ff35cb555ff41f0f4932fcb98973d5625efa7e5d 100644 --- a/metrics/npz/train_eval/step-000001363148800.npz +++ b/metrics/npz/train_eval/step-000001363148800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1088e5d743ead044cc02a1242fce15999c2e09d326f1efebd609d36fffb18166 +oid sha256:7718a521eb87cfcd6dbb500a3c0b4ae5e36526ee28ab0ad519ce36d74065d1dc size 20540 diff --git a/metrics/npz/train_eval/step-000001468006400.npz b/metrics/npz/train_eval/step-000001468006400.npz index f50ca3ffae2d1913626fb79052feb5f8b1a0fbb0..0d30f572851438937e416882bf7b53e31b0ebee0 100644 --- a/metrics/npz/train_eval/step-000001468006400.npz +++ b/metrics/npz/train_eval/step-000001468006400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58fde86d08b3ca436924d6cbfa8f61546f434e37d15643c54967356b815ddbc2 +oid sha256:0a377760799d38ceef201c02ea112f48d12c48bfd5c2ef9194a67281b3c4b598 size 20540 diff --git a/metrics/npz/train_eval/step-000001572864000.npz b/metrics/npz/train_eval/step-000001572864000.npz index 0f0a7e85de40f4b7b5852e41429c92d6c9968cae..bdfc9bc8561e6f9f8029fc682928375a6e7b7f91 100644 --- a/metrics/npz/train_eval/step-000001572864000.npz +++ b/metrics/npz/train_eval/step-000001572864000.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6da8af2424cd1c71e2140e79f20004db093563341f12adcfc564cb465b8622e +oid sha256:80a518d1389fea79c6b204dd78b21fe4643e3e61b65d2104807b653b5abca991 size 20540 diff --git a/metrics/npz/train_eval/step-000001677721600.npz b/metrics/npz/train_eval/step-000001677721600.npz index 624688fea5dfcf42fc6961a981ab417d75afeb24..d5664c8ad9fc989ba48db449aba49f71d9b90fef 100644 --- a/metrics/npz/train_eval/step-000001677721600.npz +++ b/metrics/npz/train_eval/step-000001677721600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f17bb2808d3bcb8b6ec114641a5b1c6eba99a67f4f708ed380434da29b3d10b9 +oid sha256:42b19cc212c4ae90e7d5a9f69978c1bda5a50ad2c4f18d814663d8ea45b0f899 size 20540 diff --git a/metrics/npz/train_eval/step-000001782579200.npz b/metrics/npz/train_eval/step-000001782579200.npz index aa08008a86470da66c7f47f8317130c109c6488d..2725f87b1860a4c052beba443c2dfef04638feba 100644 --- a/metrics/npz/train_eval/step-000001782579200.npz +++ b/metrics/npz/train_eval/step-000001782579200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:202b2385782126d619e374d0af8aaeb4277d105811f9e49b333c27a2d84db835 +oid sha256:edf561f29428519e6f1e81f52f81e0cad663038ec83149bc48754f8b9defa9ea size 20540 diff --git a/metrics/npz/train_eval/step-000001887436800.npz b/metrics/npz/train_eval/step-000001887436800.npz index 26a6c73baafd68329597c331a099471ad084b593..ef3f95de589d905ca6858b3d68f5dff842a814c0 100644 --- a/metrics/npz/train_eval/step-000001887436800.npz +++ b/metrics/npz/train_eval/step-000001887436800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6e276af19f20402d4fe0215c17746aacbab8e7206c92337e8ed7f021b57dfbb +oid sha256:3939f70caf466c0a57c7e8ebbee8b34400feae0428d57911a40385f41221b52f size 20540 diff --git a/metrics/npz/train_eval/step-000001992294400.npz b/metrics/npz/train_eval/step-000001992294400.npz index 8759eb1c934e3c536cfe8919b7d18051f61fb5fe..5a829317c33bcf0046400c930f738a53291cb46a 100644 --- a/metrics/npz/train_eval/step-000001992294400.npz +++ b/metrics/npz/train_eval/step-000001992294400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0da910aa3f615144aad0eeb01fa73b24c281aa4e232854be2576d0e403b5a2d8 +oid sha256:0bd0a5bf73c23210012dde63862c968e6f053177e638cb68e029b32a1162cacc size 20540 diff --git a/metrics/npz/val/step-000000041943040.npz b/metrics/npz/val/step-000000041943040.npz index 5187eab79167988271e6fc37edf88a98f473183d..f40820e321cb9c6bcf567d580dff7528fbad3178 100644 --- a/metrics/npz/val/step-000000041943040.npz +++ b/metrics/npz/val/step-000000041943040.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08a2e31e48c5238c88ef39c6b9636efd6125fda2376e0831ab537f65a10a4202 +oid sha256:1f9ba5fd2709899a0902dc4d605b1b4b400b141e5c98b4d02e5a00bc815f29ec size 21142 diff --git a/metrics/npz/val/step-000000083886080.npz b/metrics/npz/val/step-000000083886080.npz index ee2007c8dfdbcc759995e45c77d1195953514532..686da915fde1be2971a97688754a38dcb7127780 100644 --- a/metrics/npz/val/step-000000083886080.npz +++ b/metrics/npz/val/step-000000083886080.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c233fa19005aa299a07d624d2846ff23a12b36ec67c5de95d74f2e5316a9ef71 +oid sha256:8e60868b7de6855b5a73d630b2f82011a16b50d3c9b1e6e4a78a7170a1e7405e size 21142 diff --git a/metrics/npz/val/step-000000125829120.npz b/metrics/npz/val/step-000000125829120.npz index bef7b52d01be57ca2e8727bf42a504583a307628..4d6befb26923a9504cc52f69b10bd4f30f94a538 100644 --- a/metrics/npz/val/step-000000125829120.npz +++ b/metrics/npz/val/step-000000125829120.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:246ae308bf0f7a198dbc1c5adf005707cf01d4768398e0fe6fb0ea5f6f1c6ca2 +oid sha256:c2b10b365ab893feeadacb3cefb6946b4e7714b116522d65e756a83c378920f3 size 21142 diff --git a/metrics/npz/val/step-000000167772160.npz b/metrics/npz/val/step-000000167772160.npz index 7caf686f380a1063970f3db377875ba3ab7ea4a3..816d6616d0853042203b5680e221ee75f9d6f421 100644 --- a/metrics/npz/val/step-000000167772160.npz +++ b/metrics/npz/val/step-000000167772160.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30f2894ae58d5b0ce83ad73359c6c6d5b008ec6c51f980f1d00a02e4bc94ce47 +oid sha256:8cdfdd1716cb177612d3a62862afba6f66eca558bd607f1a67affa956a315f00 size 21142 diff --git a/metrics/npz/val/step-000000209715200.npz b/metrics/npz/val/step-000000209715200.npz index 5c84ed40927bd0e37d21965eb2bc51c8517d9e04..e949da9bf2120430a408869d4113b3b2656679e1 100644 --- a/metrics/npz/val/step-000000209715200.npz +++ b/metrics/npz/val/step-000000209715200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ccb9ff8cbdb27229cebf69fb4ffdf211a7c42d0f783d93fdfe00360bc3136bd +oid sha256:21905eca67d4de40c9080cb2c178d46bc017660a757e9f25c657a69d443ba3ec size 21142 diff --git a/metrics/npz/val/step-000000251658240.npz b/metrics/npz/val/step-000000251658240.npz index d5ec01b376fff11ee1f92925d94a7b8436718f2a..a464d2747a5978f523a1d11349a1e92e1d8945f1 100644 --- a/metrics/npz/val/step-000000251658240.npz +++ b/metrics/npz/val/step-000000251658240.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56317fa4c1c78b24b52c2eb45469aea42c16c0563abb898dedd74817a800e6a2 +oid sha256:352b137c1b691c16a923255fa0b63b761bd51ac169370d8da45706bbb7a04c4a size 21142 diff --git a/metrics/npz/val/step-000000293601280.npz b/metrics/npz/val/step-000000293601280.npz index 8329bedbc2588f21cedc94f226f29f28d6b4d9ca..134706cd1eabde2363667291f7a9940bbc5e71c4 100644 --- a/metrics/npz/val/step-000000293601280.npz +++ b/metrics/npz/val/step-000000293601280.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4b3bbed2a5736bbffb524e8f9fd5d4f87c8e766836a1e853c658ca673bf345d +oid sha256:cf3df61f67c3b510a24a76da42144e41631b703d6f3cf777d21b58e3d925f29b size 21142 diff --git a/metrics/npz/val/step-000000335544320.npz b/metrics/npz/val/step-000000335544320.npz index 8e1300ec2dcf73c9fce9178c9502a5e885e7c44a..13bd39603b8e70f3a0dc200ceb0abd158536d863 100644 --- a/metrics/npz/val/step-000000335544320.npz +++ b/metrics/npz/val/step-000000335544320.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06ede7b5b0cff99af24483bc0b081bd9505ca27dc0f51f71be99e0057b895882 +oid sha256:6ae4ee9a3e3af0b555a0c72a2183a19a98ad35f2da4d7e1e5efc7c24cc10e50e size 21142 diff --git a/metrics/npz/val/step-000000377487360.npz b/metrics/npz/val/step-000000377487360.npz index b62f784de00d2a7fbf652277b0de5a12cdebb8a7..11973598fe98f321c7364c571920a525ef9eacee 100644 --- a/metrics/npz/val/step-000000377487360.npz +++ b/metrics/npz/val/step-000000377487360.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db98329768e137719a0009cf0aa6664cf79b6fcda5d0dc413f2525ae3b449ffe +oid sha256:17d945dd8424f0cd27d4b6d0d068c74c6c5b58dd33fc856fc2bdd92e38e5c7cf size 21142 diff --git a/metrics/npz/val/step-000000419430400.npz b/metrics/npz/val/step-000000419430400.npz index f2c81fe6cce19cf012631e3a0a4d1357ba93dbfe..d22a056d019a1c9b6a26a1528e459a9c7f0f97bd 100644 --- a/metrics/npz/val/step-000000419430400.npz +++ b/metrics/npz/val/step-000000419430400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9224b39de02f6414626fa2bfb6356d84188629c321a9dce602579b8cc8023c05 +oid sha256:da308e3e186476c46574c1dd01a3562d86f10052ab2ca4e917939c116c891953 size 21142 diff --git a/metrics/npz/val/step-000000461373440.npz b/metrics/npz/val/step-000000461373440.npz index 37242e3e2c4c68cf3e4f617fab35f3493b550c73..3a3a9553ffc3b20539f3f1a5ccc655d3f32d218e 100644 --- a/metrics/npz/val/step-000000461373440.npz +++ b/metrics/npz/val/step-000000461373440.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a2c843622abec59d425d5ac0aef10b366f24baad31dd808ba6357d3bb8968aa +oid sha256:0c7512604a0d29b4dd95fdd2c4df34b03a293d3f1bdb0ce93fcff6a68887799a size 21142 diff --git a/metrics/npz/val/step-000000503316480.npz b/metrics/npz/val/step-000000503316480.npz index ac1e7719b68cb32ac4e60201b3d2648ddd69cdcf..5b4d58efdf1020d167b2d1b4ea77662bdfa4bbed 100644 --- a/metrics/npz/val/step-000000503316480.npz +++ b/metrics/npz/val/step-000000503316480.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2ab8e82cb4c8744493b1e554d2d9cb821406417f8cb4022daa85d61b6ca11e6 +oid sha256:e2b21b16830d02347033e6ef4c3d740739cca8b48c7f787db45dd0e882dc6431 size 21142 diff --git a/metrics/npz/val/step-000000545259520.npz b/metrics/npz/val/step-000000545259520.npz index be9eb273120d9bbe06e7712ff2faf59887c168a2..b0ef64d4c4b1610808ce3b29151f5a4ff3bf7663 100644 --- a/metrics/npz/val/step-000000545259520.npz +++ b/metrics/npz/val/step-000000545259520.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2756e5e38fb9671f38dcf11736f2eb8c55f2a99bd6841245880faa09c1635e0 +oid sha256:08d897ff92539efb1ad453a829a1f7fdecbc91261a0ddff6c24e31e4235b3d1c size 21142 diff --git a/metrics/npz/val/step-000000587202560.npz b/metrics/npz/val/step-000000587202560.npz index d4feb4c50ead39efcc225a4062004d84cd866148..a33da6a9d3dd34866df35db9328ee30e765f55fe 100644 --- a/metrics/npz/val/step-000000587202560.npz +++ b/metrics/npz/val/step-000000587202560.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:626a3c1cf6c7fea3d57f0ea66f52818bd728afcb9c7b6e05fbb1ebd23911f510 +oid sha256:0a7aefbb897e47126f6e80f40b387617fb522f349fd088fd3751db6f67699f06 size 21142 diff --git a/metrics/npz/val/step-000000629145600.npz b/metrics/npz/val/step-000000629145600.npz index 52863947fb1935de3c3e5d27ba13f9de583c2138..1f29a4072d514fab45ec142da8361f8f59965a78 100644 --- a/metrics/npz/val/step-000000629145600.npz +++ b/metrics/npz/val/step-000000629145600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f10c9e94d2634f3287f6c1cff365482bf6b8ca80a1103fb9fcae6d05b038ca3e +oid sha256:3889b01c3e3aa3ba113bcba9e9d51f52438b227cd17bb623ecac782653006d1d size 21142 diff --git a/metrics/npz/val/step-000000671088640.npz b/metrics/npz/val/step-000000671088640.npz index 085df2995f2c0fe0d78f6acc476beacf7a0e4af3..df5ade679fbe634132656a5421df6e85def12bb3 100644 --- a/metrics/npz/val/step-000000671088640.npz +++ b/metrics/npz/val/step-000000671088640.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61395ddb1b476139ccc92fc31ccef97c9ec320b6ac23d5f792c92d3a24adb94d +oid sha256:951953cbc0f6dac54c5c822d381dce8abfb2e8d9d7288b9cb2d9e0b947249e72 size 21142 diff --git a/metrics/npz/val/step-000000713031680.npz b/metrics/npz/val/step-000000713031680.npz index 7b3885bbce689efe0c63fe594e079a1cb480b8d5..f0a2770608e53dde080b52d1a4b6dd909cde622e 100644 --- a/metrics/npz/val/step-000000713031680.npz +++ b/metrics/npz/val/step-000000713031680.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66c70108c6dfc4d774b8889b8e4c2845d9adf6a9dd6a3d9af4e0a4c0a804490a +oid sha256:f01941f1c4938151b3bc3098a7f36f8560145ff2573deb7a7fd2d2756cd65bec size 21142 diff --git a/metrics/npz/val/step-000000754974720.npz b/metrics/npz/val/step-000000754974720.npz index 8bee6524e3c828e3bd2898acc68fbc5c1bcf244e..b7d6ca393da4d64bfd35c5cecb20d822203f8a2e 100644 --- a/metrics/npz/val/step-000000754974720.npz +++ b/metrics/npz/val/step-000000754974720.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e2c3d5754d3d3ff14a312187735a88c44792e6e06f3d93806e65fd6014d8457 +oid sha256:920f9bf1afb87c8a76fe5b2b70d250432fd38eb3b34fc0749662bc48ff157904 size 21142 diff --git a/metrics/npz/val/step-000000796917760.npz b/metrics/npz/val/step-000000796917760.npz index dd30e4f1d9ef0024d2de5cbf1a1e4e7fce99a61f..149c05fff6724fb3c32fa0ead6b6e6aeff69a993 100644 --- a/metrics/npz/val/step-000000796917760.npz +++ b/metrics/npz/val/step-000000796917760.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1ef9c136e9898cefaaa6df4870084321541bf2ec69d792395f8767bfa3e1e04 +oid sha256:600cd7992e36f20d90389b05886d1cc04a6f850ebfb52afefd44ef951bfafc97 size 21142 diff --git a/metrics/npz/val/step-000000838860800.npz b/metrics/npz/val/step-000000838860800.npz index 5d051930509cb99a37f5a1d9e4ee495bc043a43c..321377ab0c761ccca5d65b6abecc4d7ac12142c5 100644 --- a/metrics/npz/val/step-000000838860800.npz +++ b/metrics/npz/val/step-000000838860800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa0ce3f6d268c11d516ec7e4e45983265493df941665d602e71a65aa132b3414 +oid sha256:0c6b96794d698ff592c22f95b0125d8b6f0489d5f51f3b62fbc47557b868c558 size 21142 diff --git a/metrics/npz/val/step-000000880803840.npz b/metrics/npz/val/step-000000880803840.npz index 5f5682b791c0cfd114fc19524e840d94681bf48b..9b2fa1d519041297e304e46dbf72345c3599c760 100644 --- a/metrics/npz/val/step-000000880803840.npz +++ b/metrics/npz/val/step-000000880803840.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:471cb952fd3c733e9092a6fda7cfcd7bce5f86b1da9d21b20e51a1b1d2b20cc9 +oid sha256:25bbd80de0ced94f572ac2cd83c6c0556da51e2a5e7cadee82a0fba50c3fbbf8 size 21142 diff --git a/metrics/npz/val/step-000000922746880.npz b/metrics/npz/val/step-000000922746880.npz index 8ad75a71da91afc06130cd1c5346d39fe4627682..4aae3a77b99ce6618a7471a3166248317e681f07 100644 --- a/metrics/npz/val/step-000000922746880.npz +++ b/metrics/npz/val/step-000000922746880.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cd2cf6e03f645560cf521b87aa618238e0b9543fe7c6d7fc375884e47484a64 +oid sha256:9d148434d59dd731ae973edd769b1f6359fa6dc3692d9b6d1b3655468deca480 size 21142 diff --git a/metrics/npz/val/step-000000964689920.npz b/metrics/npz/val/step-000000964689920.npz index 212bb68288e5cb4fd5107fefbaa254a8e0e0605d..76a15005ba45941dc4c1e56f3a92ae52ba709c36 100644 --- a/metrics/npz/val/step-000000964689920.npz +++ b/metrics/npz/val/step-000000964689920.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae9695a1f3c5f4dbc8eae23dab29c70af29923a7f123cfa4c9812492ab5697ab +oid sha256:d62fb7ad3f153489d979ac9625e4640ec2b263b6e407fdf73ac3758738898f49 size 21142 diff --git a/metrics/npz/val/step-000001006632960.npz b/metrics/npz/val/step-000001006632960.npz index 2b075967a6e5c14a0e16f48388b71a99c3e6f175..17ff9b5422b27294a3b0b5c9d955bb304d345811 100644 --- a/metrics/npz/val/step-000001006632960.npz +++ b/metrics/npz/val/step-000001006632960.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b05c0255b2f0659d869ec89f46d3ea19d89fbb5349f0f9d4a58dbdc5ad42b7e3 +oid sha256:faf0b3031f88171fdcd9cb65072f1be8f15f961665815b706695529233662255 size 21142 diff --git a/metrics/npz/val/step-000001048576000.npz b/metrics/npz/val/step-000001048576000.npz index 7375b45cc9fdedb47df27d24dc875181b1a57fa8..0fa7fca5d63d33e4897da3481857af88eb13a225 100644 --- a/metrics/npz/val/step-000001048576000.npz +++ b/metrics/npz/val/step-000001048576000.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e264caab0ebedf20383f0334fae40a2b238cbd1118b1b8f2ecf5ca5c4fca05c +oid sha256:f109d034e494ec5e29c08af10ee129e78a38d39ad6d5b8fa3d3e17cb00f4c9f6 size 21142 diff --git a/metrics/npz/val/step-000001090519040.npz b/metrics/npz/val/step-000001090519040.npz index 51564da873a962350a40b19996bf2c9f1f746f46..bc6291f1699cdc00fd7041664fb9ac37189cf135 100644 --- a/metrics/npz/val/step-000001090519040.npz +++ b/metrics/npz/val/step-000001090519040.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d86a113732a067133d7a0b7b4b69c03d9077de6cf6760f5a5825ea664f6fe6e +oid sha256:e2180250ac213cb2c6c46a197bee773e0622120beb0e6d48f49d344087baca16 size 21142 diff --git a/metrics/npz/val/step-000001132462080.npz b/metrics/npz/val/step-000001132462080.npz index 24b3566d48048ee23dfab49d8d1f5b18e32a7d08..67211062ad52f433666aaf7edd56ff56511759bc 100644 --- a/metrics/npz/val/step-000001132462080.npz +++ b/metrics/npz/val/step-000001132462080.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c87f40336661d2ad733f947029b7ffa045061957dc1001a481af0954fbcfef81 +oid sha256:48ff14bb75529d4d1037db27e4839952b1c98efc6fd086e553094569913f7354 size 21142 diff --git a/metrics/npz/val/step-000001174405120.npz b/metrics/npz/val/step-000001174405120.npz index 5a349ee1f0275e73c4b57bf55f971fa7c21851a8..aede3d95047443bdbc87c7a4676b995b248f9fb5 100644 --- a/metrics/npz/val/step-000001174405120.npz +++ b/metrics/npz/val/step-000001174405120.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f04e9a582538b5ae8d9496abe54c04e503b3055a81db4d6efb4c3c32b3431041 +oid sha256:f275e84a8146fe95c8a8b272707885888c3028a600f8140c5d0b01d56ae5bbbf size 21142 diff --git a/metrics/npz/val/step-000001216348160.npz b/metrics/npz/val/step-000001216348160.npz index b8cafeae26ae8441c6506cf8159f2e177018a7da..46c874f91c098c8ba9ad33c5a328f620bef3cdaa 100644 --- a/metrics/npz/val/step-000001216348160.npz +++ b/metrics/npz/val/step-000001216348160.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c27738de0b97a553f543ae29fd462da3d21db80eb9954afd0e33dfd57c1572e +oid sha256:80f04ed4c17bfe3ebba3674d07acf4605632bc4995fdbbaa3aacb66d4e1bb7b0 size 21142 diff --git a/metrics/npz/val/step-000001258291200.npz b/metrics/npz/val/step-000001258291200.npz index 0cf806132f2a38129e9de19502ac6d098e6382d6..c9176b55ffc3f11ff6c1d35f4934353e06ff56cd 100644 --- a/metrics/npz/val/step-000001258291200.npz +++ b/metrics/npz/val/step-000001258291200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c545c4e48627f66cac7ed82782285f0b5c2c9deb10246b137c6d71411469c1d1 +oid sha256:14b8188f5abaadf079adefc306b41459652465c0dd90a2e1d9f512d1b5892d81 size 21142 diff --git a/metrics/npz/val/step-000001300234240.npz b/metrics/npz/val/step-000001300234240.npz index 256bc7d0c6953fdbba353de0f0e165cd086f5bb3..7b09f74442ee7768cd59b17b432d0263ab01c8ea 100644 --- a/metrics/npz/val/step-000001300234240.npz +++ b/metrics/npz/val/step-000001300234240.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34381dcec5bf86cefb757f71bcb2717533f7ce1cf91822743ff5c7f872299021 +oid sha256:a68c9637485b16926bfbdce63536307e52bc956fb21ec649a8216e81092e87e0 size 21142 diff --git a/metrics/npz/val/step-000001342177280.npz b/metrics/npz/val/step-000001342177280.npz index e7d0a9febfe545b57ad38c82ef970b70c9d9d4ed..15d2141b3b307403c4e97cd6768be6bbe593649f 100644 --- a/metrics/npz/val/step-000001342177280.npz +++ b/metrics/npz/val/step-000001342177280.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0d6fdcdfe91a55470b310bb933854daf0e2aafaa97ba682981c3109cf8bc398 +oid sha256:c32124e5ec840655ce08db97edfe696246a9a36ad73d677a1c8608fadd15954c size 21142 diff --git a/metrics/npz/val/step-000001384120320.npz b/metrics/npz/val/step-000001384120320.npz index 19556f100cc86edcc27ba0753ef60802318745d5..592d2049a6090ec1384f0e25d9645acc9a461a31 100644 --- a/metrics/npz/val/step-000001384120320.npz +++ b/metrics/npz/val/step-000001384120320.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d160f78303f9e5cb4f79e23164661000d8c194e455ad9dd03019f53e845448b +oid sha256:36dec67550dc63ed1818e6150f46fade994253a39d444e7f35d1f6675e5ba074 size 21142 diff --git a/metrics/npz/val/step-000001426063360.npz b/metrics/npz/val/step-000001426063360.npz index 2a3c066c155fc23638e7a7a716d462a2db6811ee..1368a6bf0583187a507a71f38555aef39d21a52e 100644 --- a/metrics/npz/val/step-000001426063360.npz +++ b/metrics/npz/val/step-000001426063360.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:973d50d3c06f26a42fd8421083fdf2918241327333d30f64db2dae8a1c92a0e0 +oid sha256:8ea96db4cc9dfd3d3047f47fc9a9c2bb1a264f316fcbc9d663ed3418dacabd37 size 21142 diff --git a/metrics/npz/val/step-000001468006400.npz b/metrics/npz/val/step-000001468006400.npz index a165eb2737de3aafe01074460ad6991ba0076145..b8fbb3efb01777641f205d81e50390bb1752ebcf 100644 --- a/metrics/npz/val/step-000001468006400.npz +++ b/metrics/npz/val/step-000001468006400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:111b08bd70250279d02c6f4312fcfc8d367aac996b7be165c5cbeb9b4c91a925 +oid sha256:a45fe83adb7d85dd0a24ea910cc80a84754516b809627e481e345759c3415f4b size 21142 diff --git a/metrics/npz/val/step-000001509949440.npz b/metrics/npz/val/step-000001509949440.npz index a9de678b77d87ed955328010efc2019d28874734..c45dc12b0fa687fd5e4d974c50340a29579a7536 100644 --- a/metrics/npz/val/step-000001509949440.npz +++ b/metrics/npz/val/step-000001509949440.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31628234c08aabc3701b0974073ac2f70f73d4ee7dace84079c00f1db36af0d4 +oid sha256:8f29017fd48ee96ae181fd7f5dd0ef9b830976869cb1859a92ce10274a10b35a size 21142 diff --git a/metrics/npz/val/step-000001551892480.npz b/metrics/npz/val/step-000001551892480.npz index 2d9ef17dd67da479e53928c568172654729e1849..a16777b1e2c8aeb5017f7fce490b1b6bf5a90cc4 100644 --- a/metrics/npz/val/step-000001551892480.npz +++ b/metrics/npz/val/step-000001551892480.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1b7e70ba0a92a99eca2a3ba0fa83ee1e9ccfc146d9b4dbb86aa72e1745982ca +oid sha256:fb508fb717805563ec995d60b3d35b257c3891dd08636e535577f921d150ba8a size 21142 diff --git a/metrics/npz/val/step-000001593835520.npz b/metrics/npz/val/step-000001593835520.npz index 1c3d6a17258cd55c7044cf9df14b307848afdd18..89b9713f5d8e9e35ed3bb0eae72c5518c211b52a 100644 --- a/metrics/npz/val/step-000001593835520.npz +++ b/metrics/npz/val/step-000001593835520.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8019232ecdd6936b38091a71d25b51fb8004253f3431cc12a825e097c8961ab0 +oid sha256:64492351e23cfe7742583969950cc20d0ddbb2565ed54bf17427f6fc683c9427 size 21142 diff --git a/metrics/npz/val/step-000001635778560.npz b/metrics/npz/val/step-000001635778560.npz index ffc1c1efa6902c516bb5f05219df3b6b486f4f1c..f34da30e3145951db8d5b1fc7bcc5776ca1477b1 100644 --- a/metrics/npz/val/step-000001635778560.npz +++ b/metrics/npz/val/step-000001635778560.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e203c902c4c1e214b33f97b0fd02e2384c85678efcf7d6a8d4db3ba4d7fe14e +oid sha256:81d98bf1a060b47ce8e8d9b7bbab4da41344e60b09937f120a7394581c32fe17 size 21142 diff --git a/metrics/npz/val/step-000001677721600.npz b/metrics/npz/val/step-000001677721600.npz index 78a73d69def60521d8166011b5f2b24222f89db8..afc96e983ffb09c1ec88e63bdf2b07d0b63ecd47 100644 --- a/metrics/npz/val/step-000001677721600.npz +++ b/metrics/npz/val/step-000001677721600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef880e063bb684ae1f4309041df00041a05ff00b7e5d7e7f89b80ca27885b642 +oid sha256:cda3545aea7bc90d245826e93eb1b873b9a984109eff28448528bf04cfcdae17 size 21142 diff --git a/metrics/npz/val/step-000001719664640.npz b/metrics/npz/val/step-000001719664640.npz index c61f2d877dc86b73c49b1a58aecaafffbf478f4b..18896e52cd12b58d101f4348f4a6cd9d807fd941 100644 --- a/metrics/npz/val/step-000001719664640.npz +++ b/metrics/npz/val/step-000001719664640.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f880599793da83202dc31e54eb6a4626cad271171c08d1a0039787c68f630169 +oid sha256:48093d496384f8a8275e2c2092cb06f72196c049d660634c461bcc27e858461b size 21142 diff --git a/metrics/npz/val/step-000001761607680.npz b/metrics/npz/val/step-000001761607680.npz index 3c180926159967ee966bb8328215e6f2e51ea2da..a9e5b7795cf63a16a29b810b1502440ea1303733 100644 --- a/metrics/npz/val/step-000001761607680.npz +++ b/metrics/npz/val/step-000001761607680.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecf14004501e94612509b30afa71f39ba550c3d0701eb838c7ec636bd44bee7f +oid sha256:96d54f8fde323c5777a888b573bf15b429ef424c0947f0d1727b3b4f7b0235a9 size 21142 diff --git a/metrics/npz/val/step-000001803550720.npz b/metrics/npz/val/step-000001803550720.npz index 47cc9c4731488e9ada441491af680311c5e46b0b..467db8cd5f9abb9b316102d36e66e9c743f7a38f 100644 --- a/metrics/npz/val/step-000001803550720.npz +++ b/metrics/npz/val/step-000001803550720.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edf9e6b6e8e09d41616ef63d79a147dbaeaff3ba83a1906791c55901010ecacc +oid sha256:d94998d01432cfcfc4fc1db78476a889001b6f666c37985ae4e940aefbe444bf size 21142 diff --git a/metrics/npz/val/step-000001845493760.npz b/metrics/npz/val/step-000001845493760.npz index 29b51c1d348f7c25b47e4a7bc18a80681bc26611..55970cb9b15f3efbe5ebccd2905c8c69b823dc86 100644 --- a/metrics/npz/val/step-000001845493760.npz +++ b/metrics/npz/val/step-000001845493760.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:548b3dedabd89d467c9f1bede6ee28946b1a390afc654a03fb07541329ae7230 +oid sha256:1fb31f66a1a7fbb940b24508a2f3c5472335ce7c9a46a199f1548659a2292f27 size 21142 diff --git a/metrics/npz/val/step-000001887436800.npz b/metrics/npz/val/step-000001887436800.npz index 5adbcfaae4ea6fe25bef5dbaafb7b0e852db9490..8c1d73f0dc48dd3209a88255fd407f060b596b6d 100644 --- a/metrics/npz/val/step-000001887436800.npz +++ b/metrics/npz/val/step-000001887436800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cffb6e3b0047f49286e70b10248cb4e09f71e89e58a0b29918a5099069a4883a +oid sha256:4a5f2dd399fb14bc576a08c66525320e29a1a085d0087fb0c70b44cb391b8a19 size 21142 diff --git a/metrics/npz/val/step-000001929379840.npz b/metrics/npz/val/step-000001929379840.npz index e9f694dd43059000a1923892bf3a40f50886821f..96a714e35cce10e0296fe8217e246d86c2c3a0ec 100644 --- a/metrics/npz/val/step-000001929379840.npz +++ b/metrics/npz/val/step-000001929379840.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c52d474cfd4ce337b85888b418016a232d31076d2cc87a8cbdaf4a2bcabb25a +oid sha256:c498cbd2bd8bf590f8f4763e8446c2dcf8968848fa596c995b840ce3bc16315e size 21142 diff --git a/metrics/npz/val/step-000001971322880.npz b/metrics/npz/val/step-000001971322880.npz index 17c15db24f93ee105da983e8d29f87a6451c08ae..ffdec856f5420d667f43e376b0f4dad986bcb195 100644 --- a/metrics/npz/val/step-000001971322880.npz +++ b/metrics/npz/val/step-000001971322880.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5928abad6e0cc8b7dbb0d60888e888774a68e56aa900bf75432637b82d859fb8 +oid sha256:e393586ff90b5a2931dc455b1201c506ac34d55acff352e4fee2b43e68f79db7 size 21142 diff --git a/metrics/npz/val/step-000002013265920.npz b/metrics/npz/val/step-000002013265920.npz index ab1d40dd4bc874e2b79c89384aa2db5783c7e318..c8c4051300675a0460d7cb526f99eb481539280e 100644 --- a/metrics/npz/val/step-000002013265920.npz +++ b/metrics/npz/val/step-000002013265920.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64cfc532c9a1fbbb772cd5f699808b3cde889f358a715338f58b5cf203f86a24 +oid sha256:ec33d5a53041f764cec4b7f404c3ab5fa463016a0a72a8b06ea27b0479ffc043 size 21142 diff --git a/metrics/npz/val/step-000002055208960.npz b/metrics/npz/val/step-000002055208960.npz index 2196a22baf430ee463ccd4c965eb8b21e49469ee..92d9c23afbdf1c3975bb8d7b8d4d4b586d2bf778 100644 --- a/metrics/npz/val/step-000002055208960.npz +++ b/metrics/npz/val/step-000002055208960.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6070cdf59662ea38ed57ba13ea9766d6e5e0c109bb2fdb6eebbecf1240efe7c1 +oid sha256:36dbb5d52ee5b3f055f0a03a75a6d034a1be97fd5b764665b2d638395bcc1065 size 21142 diff --git a/metrics/wandb/wandb_run_id.txt b/metrics/wandb/wandb_run_id.txt index 8f5c79154d8d38f6942b456c02031b36c226db1e..7245db3fa7783ecae76764f1c139acac0a152984 100644 --- a/metrics/wandb/wandb_run_id.txt +++ b/metrics/wandb/wandb_run_id.txt @@ -1 +1 @@ -6hdpsmpp \ No newline at end of file +pun8f82u \ No newline at end of file diff --git a/model.txt b/model.txt index d5a7180d0a8413695e4ff6cade7069e0e5926476..db510c44c2c5a6b0e4cb36fbec000f950faa77da 100644 --- a/model.txt +++ b/model.txt @@ -1,24 +1,22 @@ AlibiForCausalLM( - (model): AlibiModel( - (embeddings): Embedding(50277, 256) - (layers): ModuleList( - (0-1): 2 x TransformerBlock( - (attn_norm): RMSNorm(256, eps=1e-06) - (attn): Attention( - (q_proj): Linear(in_features=256, out_features=256, bias=False) - (k_proj): Linear(in_features=256, out_features=256, bias=False) - (v_proj): Linear(in_features=256, out_features=256, bias=False) - (o_proj): Linear(in_features=256, out_features=256, bias=False) - ) - (mlp_norm): RMSNorm(256, eps=1e-06) - (mlp): TransformerMLP( - (gate_proj): Linear(in_features=256, out_features=1536, bias=False) - (down_proj): Linear(in_features=768, out_features=256, bias=False) - (act_fn): SiLU() - ) + (emb): Embedding(50277, 256) + (layers): ModuleList( + (0-1): 2 x TransformerBlock( + (attn_norm): RMSNorm(256, eps=1e-06) + (attn): Attention( + (q_proj): Linear(in_features=256, out_features=256, bias=False) + (k_proj): Linear(in_features=256, out_features=256, bias=False) + (v_proj): Linear(in_features=256, out_features=256, bias=False) + (o_proj): Linear(in_features=256, out_features=256, bias=False) + ) + (mlp_norm): RMSNorm(256, eps=1e-06) + (mlp): TransformerMLP( + (gate_proj): Linear(in_features=256, out_features=1536, bias=False) + (down_proj): Linear(in_features=768, out_features=256, bias=False) + (act_fn): SiLU() ) ) - (norm): RMSNorm(256, eps=1e-06) ) + (norm): RMSNorm(256, eps=1e-06) (lm_head): Linear(in_features=256, out_features=50277, bias=False) ) diff --git a/no_decay_params.txt b/no_decay_params.txt index 7acf4201eb540c492bd36f923aa1466d9d07cb09..83056d891dbdc5a102994f71ed0098a627804cfc 100644 --- a/no_decay_params.txt +++ b/no_decay_params.txt @@ -1,5 +1,5 @@ -_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight -_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight -_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight -_forward_module._fsdp_wrapped_module.model.norm.weight +_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight +_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight +_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight +_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight +_forward_module._fsdp_wrapped_module.norm.weight