diff --git a/checkpoints/step-000000209715200.pt b/checkpoints/step-000000209715200.pt index 6c8ac1ae9f244f0a93927a8f094deae715e76444..2237b7f23452243c42117d79358e8fc98cf629f3 100644 --- a/checkpoints/step-000000209715200.pt +++ b/checkpoints/step-000000209715200.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b3dc0d8bd5520cae4d864552463a2886fa07b73cdc47d2a37ce83bb212dc323 -size 329410370 +oid sha256:8285b1275f636f612d151dec02c5d81f4b8c1db325da20ed27c3c0906cdcfd0b +size 329409666 diff --git a/checkpoints/step-000000419430400.pt b/checkpoints/step-000000419430400.pt index af6d783254c2f260a79e32307bdddad0dd5f9992..0531bd36b43729a26e2dc2308677d9120837e59a 100644 --- a/checkpoints/step-000000419430400.pt +++ b/checkpoints/step-000000419430400.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:197c3ce85adaa7c75aca1bc28dba6de4e903f1932840a3477ccbfe5081dd4411 -size 329410370 +oid sha256:5d1d5a3eb419481d7280395e792e83aec56b0f40085ed2e8c52bd911b3fb3be0 +size 329409666 diff --git a/checkpoints/step-000000629145600.pt b/checkpoints/step-000000629145600.pt index fcae7a0137591b68097c006047006e91a6714a9a..288e8086789eb9eefb7d00eaddf5894ccbda808c 100644 --- a/checkpoints/step-000000629145600.pt +++ b/checkpoints/step-000000629145600.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ffa926c857f4187d24fb8c1476f007f3115e50ef587c9e0addd80d3a5cde704 -size 329410370 +oid sha256:135e47df1a60a84d73797ebba91af1b93cc5a876edb699f61fdaebf4c735a1da +size 329409666 diff --git a/checkpoints/step-000000838860800.pt b/checkpoints/step-000000838860800.pt index 790a4bfaf1b5ecbfc75610e39fbf42c44e8e9bfb..e544607df0edff9eee8b850552548216d6f28f5a 100644 --- a/checkpoints/step-000000838860800.pt +++ b/checkpoints/step-000000838860800.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bbf8089487ef6155cd623dc477cbcb09d8e14350f5dd57ac8188890a9c1cc47 -size 329410370 +oid sha256:849a49cdb5bf1ef2b9da3c7ed39cb6ed724793a95f400f524d37fd50a631e3a6 +size 329409666 diff --git a/checkpoints/step-000001048576000.pt b/checkpoints/step-000001048576000.pt index 843af9064f118451f2874db0077bbe152631b4e0..3ab8f9c5454689081e8ddad8e8a43f75adb7ccbf 100644 --- a/checkpoints/step-000001048576000.pt +++ b/checkpoints/step-000001048576000.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08caa24342ad8da2da4ecb5be835df6926f912ed46c8a3348ac0c43af76063a8 -size 329410370 +oid sha256:72ada714c05b103600ae9b70aba26b160d5eb815b6399ce0c5bce3f3cdc8cd63 +size 329409666 diff --git a/checkpoints/step-000001258291200.pt b/checkpoints/step-000001258291200.pt index 6b10cf4f1297327717c19024e43beb86181c8c05..39aaba29c4b741c22231c0ca23ff2e9e03692b25 100644 --- a/checkpoints/step-000001258291200.pt +++ b/checkpoints/step-000001258291200.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd1fd780b449e6a9a027a41fe8410eaf18503d300ed7b88ad0746df2a0301c82 -size 329410370 +oid sha256:df68891154b50e797f1ac781f9ccba307e3516009e0b4fe66b67ffb0e7811cee +size 329409666 diff --git a/checkpoints/step-000001468006400.pt b/checkpoints/step-000001468006400.pt index 3391de44fc0313bd73c3e3174cd1f1e2a2c41fd8..562888474f8c6f7f840820e7d87deb930368d1fa 100644 --- a/checkpoints/step-000001468006400.pt +++ b/checkpoints/step-000001468006400.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33e749571b391d13f25672d73a33515e15822b76b028f65d3e0edd8942e603bb -size 329410370 +oid sha256:d8166d4070b53b6d58ce1ff21a80c08f4730d11eaeb1274a1edc9502ffe1e585 +size 329409666 diff --git a/checkpoints/step-000001677721600.pt b/checkpoints/step-000001677721600.pt index d689c19c3f207fd1842099824d0da5bd8359d849..901ca5ead83e57ad1a458fb46bd43586e38f7967 100644 --- a/checkpoints/step-000001677721600.pt +++ b/checkpoints/step-000001677721600.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:476798da159b1371c534ffcd6919778429ba1178a2bde18a0609f87a832989f3 -size 329410370 +oid sha256:0812b579499302700f9a59b19cca469b7b235b15091dd742378e33e33d6e3734 +size 329409666 diff --git a/checkpoints/step-000001887436800.pt b/checkpoints/step-000001887436800.pt index c38cef0f8112a4d6537128978cc22f864ad94760..a8c0d6d6af9ffdb23a10a5cd80b3fe3a74f56594 100644 --- a/checkpoints/step-000001887436800.pt +++ b/checkpoints/step-000001887436800.pt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bbd8a2e9bf71df863966b176483676d325b894dc43f9defbebba152af38ea84 -size 329410370 +oid sha256:78266535cd097dcd343cfc17a80c83741e912e4e4afcb575e500e817fdcfe2f6 +size 329409666 diff --git a/decay_params.txt b/decay_params.txt index 594174bb9c7c453d9bfca41187ccdaf55c0f9b80..da0fadf570ed74e3166b3325c3da23358bfba211 100644 --- a/decay_params.txt +++ b/decay_params.txt @@ -1,14 +1,14 @@ -_forward_module._fsdp_wrapped_module.model.embeddings.weight -_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight -_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight +_forward_module._fsdp_wrapped_module.emb.weight +_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight +_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight +_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight _forward_module._fsdp_wrapped_module.lm_head.weight diff --git a/logs/2025-10-28_01-48-25.log b/logs/2025-10-28_01-48-25.log new file mode 100644 index 0000000000000000000000000000000000000000..67282d6c8ba73b435716c9bb34dcad32def328e7 --- /dev/null +++ b/logs/2025-10-28_01-48-25.log @@ -0,0 +1,262 @@ +[2025-10-28 01:48:25][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_2_4_256` +[2025-10-28 01:48:25][train:375][INFO] Configuration: +[2025-10-28 01:48:25][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_2_4_256/config.yaml. +[2025-10-28 01:48:25][train:387][INFO] creating datamodule +[2025-10-28 01:48:25][train:419][INFO] creating model +[2025-10-28 01:48:25][train:440][INFO] creating optimizer +[2025-10-28 01:48:25][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints... +[2025-10-28 01:48:25][logger:256][INFO] Setting up wandb logger... +[2025-10-28 01:48:25][logger:272][INFO] Not resuming. Creating a new wandb run. +[2025-10-28 01:48:26][logger:288][INFO] wandb initialized. Run id: gwmp4t3h +[2025-10-28 01:48:26][logger:186][INFO] Setting up jsonlines logger... +[2025-10-28 01:48:26][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/resume.jsonl since we are not resuming +[2025-10-28 01:48:26][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/train_data_info.jsonl since we are not resuming +[2025-10-28 01:48:26][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/val_data_info.jsonl since we are not resuming +[2025-10-28 01:48:26][logger:199][INFO] Deleting /workspace/forgetting-transformer/alibi_2_4_256/metrics/jsonlines/model_info.jsonl since we are not resuming +[2025-10-28 01:48:26][logger:113][INFO] Setting up npz logger... +[2025-10-28 01:48:26][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024] +[2025-10-28 01:48:26][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1] +[2025-10-28 01:48:26][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128] +[2025-10-28 01:49:37][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:01:09] [ETA: 1:55:28] [loss: 9.774] [tokens/s: 309942.692] [batches/s: 0.148] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 01:50:44][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:02:17] [ETA: 1:52:23] [loss: 8.196] [tokens/s: 309977.378] [batches/s: 0.148] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 01:50:44][train:194][INFO] Running validation... +[2025-10-28 01:52:25][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 137.633] [val/train_update_time: 137.344] [val/loss: 8.098] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.750] [val/val_tokens_per_second: 406549.998] [val/loss_avg_len_2048: 8.098] [val/perplexity_len_2048: 3287.787] [val/loss_avg_len_1024: 8.101] [val/perplexity_len_1024: 3296.261] [val/loss_avg_len_512: 8.101] [val/perplexity_len_512: 3297.223] +[2025-10-28 01:53:33][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:05:06] [ETA: 2:44:55] [loss: 7.712] [tokens/s: 204800.058] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 01:54:40][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:06:13] [ETA: 2:29:28] [loss: 7.524] [tokens/s: 224321.330] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 01:54:40][train:194][INFO] Running validation... +[2025-10-28 01:56:21][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 373.694] [val/train_update_time: 272.436] [val/loss: 7.510] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.374] [val/val_tokens_per_second: 408074.046] [val/loss_avg_len_2048: 7.510] [val/perplexity_len_2048: 1825.331] [val/loss_avg_len_1024: 7.513] [val/perplexity_len_1024: 1831.575] [val/loss_avg_len_512: 7.515] [val/perplexity_len_512: 1834.639] +[2025-10-28 01:57:28][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:09:01] [ETA: 2:51:32] [loss: 7.361] [tokens/s: 192927.320] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 01:57:28][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 541.726] [train_eval/train_update_time: 339.966] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.431] [train_eval/perplexity_len_2048: 4588.366] [train_eval/loss_avg_len_1024: 8.436] [train_eval/perplexity_len_1024: 4609.090] [train_eval/loss_avg_len_512: 8.436] [train_eval/perplexity_len_512: 4609.426] +[2025-10-28 01:58:36][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:10:09] [ETA: 2:39:06] [loss: 7.195] [tokens/s: 206123.202] [batches/s: 0.098] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 01:58:36][train:194][INFO] Running validation... +[2025-10-28 02:00:16][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 609.370] [val/train_update_time: 407.492] [val/loss: 7.194] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.850] [val/val_tokens_per_second: 410215.493] [val/loss_avg_len_2048: 7.194] [val/perplexity_len_2048: 1331.632] [val/loss_avg_len_1024: 7.199] [val/perplexity_len_1024: 1338.256] [val/loss_avg_len_512: 7.204] [val/perplexity_len_512: 1345.080] +[2025-10-28 02:01:23][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:12:56] [ETA: 2:52:00] [loss: 7.097] [tokens/s: 188480.049] [batches/s: 0.090] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:02:31][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:14:04] [ETA: 2:41:51] [loss: 6.985] [tokens/s: 198329.429] [batches/s: 0.095] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:02:31][train:194][INFO] Running validation... +[2025-10-28 02:04:12][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 844.441] [val/train_update_time: 542.489] [val/loss: 6.967] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.669] [val/val_tokens_per_second: 406876.866] [val/loss_avg_len_2048: 6.967] [val/perplexity_len_2048: 1060.792] [val/loss_avg_len_1024: 6.973] [val/perplexity_len_1024: 1067.275] [val/loss_avg_len_512: 6.980] [val/perplexity_len_512: 1074.812] +[2025-10-28 02:05:19][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:16:52] [ETA: 2:50:39] [loss: 6.864] [tokens/s: 185967.467] [batches/s: 0.089] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:06:27][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:18:00] [ETA: 2:42:03] [loss: 6.781] [tokens/s: 193805.258] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:06:27][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1080.360] [train_eval/train_update_time: 677.513] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 7.043] [train_eval/perplexity_len_2048: 1145.164] [train_eval/loss_avg_len_1024: 7.050] [train_eval/perplexity_len_1024: 1152.943] [train_eval/loss_avg_len_512: 7.055] [train_eval/perplexity_len_512: 1158.738] +[2025-10-28 02:06:27][train:194][INFO] Running validation... +[2025-10-28 02:08:07][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 1080.360] [val/train_update_time: 677.513] [val/loss: 6.774] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.503] [val/val_tokens_per_second: 407549.903] [val/loss_avg_len_2048: 6.774] [val/perplexity_len_2048: 874.407] [val/loss_avg_len_1024: 6.781] [val/perplexity_len_1024: 880.950] [val/loss_avg_len_512: 6.790] [val/perplexity_len_512: 888.990] +[2025-10-28 02:08:07][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt... +[2025-10-28 02:08:08][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt. +[2025-10-28 02:08:08][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.455] +[2025-10-28 02:09:16][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:20:48] [ETA: 2:48:25] [loss: 6.698] [tokens/s: 177119.434] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:10:23][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:21:56] [ETA: 2:40:54] [loss: 6.593] [tokens/s: 193779.584] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:10:23][train:194][INFO] Running validation... +[2025-10-28 02:12:04][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1316.574] [val/train_update_time: 812.517] [val/loss: 6.612] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.810] [val/val_tokens_per_second: 406308.306] [val/loss_avg_len_2048: 6.612] [val/perplexity_len_2048: 744.079] [val/loss_avg_len_1024: 6.621] [val/perplexity_len_1024: 750.549] [val/loss_avg_len_512: 6.632] [val/perplexity_len_512: 758.811] +[2025-10-28 02:13:12][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:24:45] [ETA: 2:45:38] [loss: 6.562] [tokens/s: 177120.861] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:14:19][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:25:52] [ETA: 2:38:57] [loss: 6.505] [tokens/s: 193716.354] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:14:19][train:194][INFO] Running validation... +[2025-10-28 02:16:00][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1552.606] [val/train_update_time: 947.496] [val/loss: 6.478] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.429] [val/val_tokens_per_second: 407851.660] [val/loss_avg_len_2048: 6.478] [val/perplexity_len_2048: 650.688] [val/loss_avg_len_1024: 6.488] [val/perplexity_len_1024: 657.094] [val/loss_avg_len_512: 6.500] [val/perplexity_len_512: 665.465] +[2025-10-28 02:17:07][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:28:40] [ETA: 2:42:30] [loss: 6.406] [tokens/s: 177126.378] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:17:07][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1720.648] [train_eval/train_update_time: 1014.977] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.585] [train_eval/perplexity_len_2048: 723.963] [train_eval/loss_avg_len_1024: 6.593] [train_eval/perplexity_len_1024: 729.975] [train_eval/loss_avg_len_512: 6.604] [train_eval/perplexity_len_512: 737.766] +[2025-10-28 02:18:15][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:29:48] [ETA: 2:36:28] [loss: 6.375] [tokens/s: 193628.897] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:18:15][train:194][INFO] Running validation... +[2025-10-28 02:19:55][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1788.245] [val/train_update_time: 1082.469] [val/loss: 6.371] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.337] [val/val_tokens_per_second: 408224.188] [val/loss_avg_len_2048: 6.371] [val/perplexity_len_2048: 584.802] [val/loss_avg_len_1024: 6.382] [val/perplexity_len_1024: 591.124] [val/loss_avg_len_512: 6.396] [val/perplexity_len_512: 599.393] +[2025-10-28 02:21:03][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:32:36] [ETA: 2:39:10] [loss: 6.300] [tokens/s: 177058.054] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:22:10][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:33:43] [ETA: 2:33:39] [loss: 6.269] [tokens/s: 193687.228] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:22:10][train:194][INFO] Running validation... +[2025-10-28 02:23:51][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 2023.812] [val/train_update_time: 1217.452] [val/loss: 6.274] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.357] [val/val_tokens_per_second: 408143.018] [val/loss_avg_len_2048: 6.274] [val/perplexity_len_2048: 530.567] [val/loss_avg_len_1024: 6.286] [val/perplexity_len_1024: 536.794] [val/loss_avg_len_512: 6.301] [val/perplexity_len_512: 545.219] +[2025-10-28 02:24:58][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:36:31] [ETA: 2:35:43] [loss: 6.286] [tokens/s: 177107.743] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:26:06][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:37:39] [ETA: 2:30:37] [loss: 6.203] [tokens/s: 193804.458] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:26:06][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2259.375] [train_eval/train_update_time: 1352.433] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.304] [train_eval/perplexity_len_2048: 546.957] [train_eval/loss_avg_len_1024: 6.316] [train_eval/perplexity_len_1024: 553.212] [train_eval/loss_avg_len_512: 6.330] [train_eval/perplexity_len_512: 560.967] +[2025-10-28 02:26:06][train:194][INFO] Running validation... +[2025-10-28 02:27:46][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 2259.375] [val/train_update_time: 1352.433] [val/loss: 6.200] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.386] [val/val_tokens_per_second: 408026.058] [val/loss_avg_len_2048: 6.200] [val/perplexity_len_2048: 492.537] [val/loss_avg_len_1024: 6.212] [val/perplexity_len_1024: 498.572] [val/loss_avg_len_512: 6.228] [val/perplexity_len_512: 506.921] +[2025-10-28 02:27:46][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt... +[2025-10-28 02:27:47][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt. +[2025-10-28 02:27:47][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.442] +[2025-10-28 02:28:54][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:40:27] [ETA: 2:32:13] [loss: 6.156] [tokens/s: 177134.593] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:30:02][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:41:35] [ETA: 2:27:27] [loss: 6.146] [tokens/s: 193804.240] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:30:02][train:194][INFO] Running validation... +[2025-10-28 02:31:43][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2495.437] [val/train_update_time: 1487.442] [val/loss: 6.132] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.703] [val/val_tokens_per_second: 406740.433] [val/loss_avg_len_2048: 6.132] [val/perplexity_len_2048: 460.365] [val/loss_avg_len_1024: 6.145] [val/perplexity_len_1024: 466.377] [val/loss_avg_len_512: 6.163] [val/perplexity_len_512: 474.794] +[2025-10-28 02:32:50][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:44:23] [ETA: 2:28:37] [loss: 6.099] [tokens/s: 177146.443] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:33:58][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:45:31] [ETA: 2:24:09] [loss: 6.058] [tokens/s: 193740.553] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:33:58][train:194][INFO] Running validation... +[2025-10-28 02:35:39][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2731.434] [val/train_update_time: 1622.491] [val/loss: 6.077] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.997] [val/val_tokens_per_second: 405558.321] [val/loss_avg_len_2048: 6.077] [val/perplexity_len_2048: 435.701] [val/loss_avg_len_1024: 6.091] [val/perplexity_len_1024: 441.655] [val/loss_avg_len_512: 6.109] [val/perplexity_len_512: 450.082] +[2025-10-28 02:36:47][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:48:20] [ETA: 2:25:00] [loss: 6.051] [tokens/s: 177048.600] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:36:47][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2900.077] [train_eval/train_update_time: 1690.001] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.119] [train_eval/perplexity_len_2048: 454.201] [train_eval/loss_avg_len_1024: 6.129] [train_eval/perplexity_len_1024: 458.908] [train_eval/loss_avg_len_512: 6.145] [train_eval/perplexity_len_512: 466.193] +[2025-10-28 02:37:54][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:49:27] [ETA: 2:20:46] [loss: 6.019] [tokens/s: 193606.916] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:37:54][train:194][INFO] Running validation... +[2025-10-28 02:39:35][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2967.730] [val/train_update_time: 1757.532] [val/loss: 6.025] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.815] [val/val_tokens_per_second: 406288.925] [val/loss_avg_len_2048: 6.025] [val/perplexity_len_2048: 413.628] [val/loss_avg_len_1024: 6.039] [val/perplexity_len_1024: 419.547] [val/loss_avg_len_512: 6.059] [val/perplexity_len_512: 427.893] +[2025-10-28 02:40:43][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:52:16] [ETA: 2:21:19] [loss: 6.010] [tokens/s: 176963.151] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:41:50][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:53:23] [ETA: 2:17:18] [loss: 5.960] [tokens/s: 193511.888] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:41:50][train:194][INFO] Running validation... +[2025-10-28 02:43:31][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 3203.841] [val/train_update_time: 1892.586] [val/loss: 5.974] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.200] [val/val_tokens_per_second: 408780.967] [val/loss_avg_len_2048: 5.974] [val/perplexity_len_2048: 392.953] [val/loss_avg_len_1024: 5.988] [val/perplexity_len_1024: 398.795] [val/loss_avg_len_512: 6.009] [val/perplexity_len_512: 407.195] +[2025-10-28 02:44:38][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:56:11] [ETA: 2:17:34] [loss: 5.931] [tokens/s: 176978.071] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:45:46][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:57:19] [ETA: 2:13:45] [loss: 5.915] [tokens/s: 193613.347] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:45:46][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3439.312] [train_eval/train_update_time: 2027.637] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.993] [train_eval/perplexity_len_2048: 400.721] [train_eval/loss_avg_len_1024: 6.003] [train_eval/perplexity_len_1024: 404.814] [train_eval/loss_avg_len_512: 6.021] [train_eval/perplexity_len_512: 411.889] +[2025-10-28 02:45:46][train:194][INFO] Running validation... +[2025-10-28 02:47:27][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 3439.312] [val/train_update_time: 2027.637] [val/loss: 5.937] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.627] [val/val_tokens_per_second: 407047.134] [val/loss_avg_len_2048: 5.937] [val/perplexity_len_2048: 378.618] [val/loss_avg_len_1024: 5.952] [val/perplexity_len_1024: 384.436] [val/loss_avg_len_512: 5.974] [val/perplexity_len_512: 392.897] +[2025-10-28 02:47:27][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt... +[2025-10-28 02:47:27][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt. +[2025-10-28 02:47:27][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.424] +[2025-10-28 02:48:35][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 1:00:07] [ETA: 2:13:50] [loss: 5.956] [tokens/s: 176935.217] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:49:42][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 1:01:15] [ETA: 2:10:10] [loss: 5.876] [tokens/s: 193535.378] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:49:42][train:194][INFO] Running validation... +[2025-10-28 02:51:23][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3675.684] [val/train_update_time: 2162.691] [val/loss: 5.899] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.480] [val/val_tokens_per_second: 407644.252] [val/loss_avg_len_2048: 5.899] [val/perplexity_len_2048: 364.765] [val/loss_avg_len_1024: 5.915] [val/perplexity_len_1024: 370.535] [val/loss_avg_len_512: 5.938] [val/perplexity_len_512: 379.101] +[2025-10-28 02:52:30][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 1:04:03] [ETA: 2:10:04] [loss: 5.916] [tokens/s: 176959.550] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:53:38][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 1:05:11] [ETA: 2:06:32] [loss: 5.878] [tokens/s: 193631.925] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:53:38][train:194][INFO] Running validation... +[2025-10-28 02:55:18][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3911.438] [val/train_update_time: 2297.710] [val/loss: 5.866] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.256] [val/val_tokens_per_second: 408554.595] [val/loss_avg_len_2048: 5.866] [val/perplexity_len_2048: 352.944] [val/loss_avg_len_1024: 5.882] [val/perplexity_len_1024: 358.687] [val/loss_avg_len_512: 5.906] [val/perplexity_len_512: 367.231] +[2025-10-28 02:56:26][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 1:07:59] [ETA: 2:06:15] [loss: 5.850] [tokens/s: 177072.118] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:56:26][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4079.355] [train_eval/train_update_time: 2365.248] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.894] [train_eval/perplexity_len_2048: 362.793] [train_eval/loss_avg_len_1024: 5.908] [train_eval/perplexity_len_1024: 367.821] [train_eval/loss_avg_len_512: 5.928] [train_eval/perplexity_len_512: 375.441] +[2025-10-28 02:57:34][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:09:07] [ETA: 2:02:52] [loss: 5.787] [tokens/s: 193731.211] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 02:57:34][train:194][INFO] Running validation... +[2025-10-28 02:59:14][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 4147.001] [val/train_update_time: 2432.774] [val/loss: 5.838] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.454] [val/val_tokens_per_second: 407750.244] [val/loss_avg_len_2048: 5.838] [val/perplexity_len_2048: 343.229] [val/loss_avg_len_1024: 5.855] [val/perplexity_len_1024: 348.990] [val/loss_avg_len_512: 5.879] [val/perplexity_len_512: 357.566] +[2025-10-28 03:00:22][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:11:55] [ETA: 2:02:27] [loss: 5.819] [tokens/s: 177128.887] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:01:29][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:13:02] [ETA: 1:59:10] [loss: 5.792] [tokens/s: 193690.683] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:01:29][train:194][INFO] Running validation... +[2025-10-28 03:03:10][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 4382.723] [val/train_update_time: 2567.802] [val/loss: 5.807] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.155] [val/val_tokens_per_second: 408964.937] [val/loss_avg_len_2048: 5.807] [val/perplexity_len_2048: 332.631] [val/loss_avg_len_1024: 5.824] [val/perplexity_len_1024: 338.388] [val/loss_avg_len_512: 5.850] [val/perplexity_len_512: 347.116] +[2025-10-28 03:04:17][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:15:50] [ETA: 1:58:37] [loss: 5.811] [tokens/s: 177137.275] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:05:25][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:16:58] [ETA: 1:55:27] [loss: 5.713] [tokens/s: 193852.270] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:05:25][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4618.154] [train_eval/train_update_time: 2702.834] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.817] [train_eval/perplexity_len_2048: 336.075] [train_eval/loss_avg_len_1024: 5.830] [train_eval/perplexity_len_1024: 340.317] [train_eval/loss_avg_len_512: 5.851] [train_eval/perplexity_len_512: 347.695] +[2025-10-28 03:05:25][train:194][INFO] Running validation... +[2025-10-28 03:07:05][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 4618.154] [val/train_update_time: 2702.834] [val/loss: 5.782] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.075] [val/val_tokens_per_second: 409291.934] [val/loss_avg_len_2048: 5.782] [val/perplexity_len_2048: 324.540] [val/loss_avg_len_1024: 5.800] [val/perplexity_len_1024: 330.279] [val/loss_avg_len_512: 5.826] [val/perplexity_len_512: 339.046] +[2025-10-28 03:07:05][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt... +[2025-10-28 03:07:05][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt. +[2025-10-28 03:07:05][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.425] +[2025-10-28 03:08:13][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:19:46] [ETA: 1:54:47] [loss: 5.736] [tokens/s: 177219.607] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:09:21][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:20:53] [ETA: 1:51:43] [loss: 5.749] [tokens/s: 193860.014] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:09:21][train:194][INFO] Running validation... +[2025-10-28 03:11:00][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4853.921] [val/train_update_time: 2837.866] [val/loss: 5.759] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.822] [val/val_tokens_per_second: 410332.277] [val/loss_avg_len_2048: 5.759] [val/perplexity_len_2048: 316.982] [val/loss_avg_len_1024: 5.777] [val/perplexity_len_1024: 322.744] [val/loss_avg_len_512: 5.804] [val/perplexity_len_512: 331.528] +[2025-10-28 03:12:08][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:23:41] [ETA: 1:50:56] [loss: 5.747] [tokens/s: 177330.961] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:13:16][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:24:49] [ETA: 1:47:56] [loss: 5.780] [tokens/s: 193937.170] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:13:16][train:194][INFO] Running validation... +[2025-10-28 03:14:56][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 5089.012] [val/train_update_time: 2972.891] [val/loss: 5.737] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.327] [val/val_tokens_per_second: 408266.010] [val/loss_avg_len_2048: 5.737] [val/perplexity_len_2048: 310.286] [val/loss_avg_len_1024: 5.756] [val/perplexity_len_1024: 316.032] [val/loss_avg_len_512: 5.784] [val/perplexity_len_512: 324.904] +[2025-10-28 03:16:04][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:27:36] [ETA: 1:47:05] [loss: 5.723] [tokens/s: 177321.705] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:16:04][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5256.981] [train_eval/train_update_time: 3040.416] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.752] [train_eval/perplexity_len_2048: 314.970] [train_eval/loss_avg_len_1024: 5.769] [train_eval/perplexity_len_1024: 320.169] [train_eval/loss_avg_len_512: 5.795] [train_eval/perplexity_len_512: 328.749] +[2025-10-28 03:17:11][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:28:44] [ETA: 1:44:10] [loss: 5.698] [tokens/s: 193963.049] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:17:11][train:194][INFO] Running validation... +[2025-10-28 03:18:51][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 5324.629] [val/train_update_time: 3107.943] [val/loss: 5.716] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.807] [val/val_tokens_per_second: 410393.901] [val/loss_avg_len_2048: 5.716] [val/perplexity_len_2048: 303.782] [val/loss_avg_len_1024: 5.735] [val/perplexity_len_1024: 309.495] [val/loss_avg_len_512: 5.763] [val/perplexity_len_512: 318.364] +[2025-10-28 03:19:59][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:31:32] [ETA: 1:43:13] [loss: 5.713] [tokens/s: 177415.998] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:21:06][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:32:39] [ETA: 1:40:23] [loss: 5.699] [tokens/s: 194019.719] [batches/s: 0.093] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:21:06][train:194][INFO] Running validation... +[2025-10-28 03:22:47][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 5559.744] [val/train_update_time: 3242.999] [val/loss: 5.701] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.360] [val/val_tokens_per_second: 408130.422] [val/loss_avg_len_2048: 5.701] [val/perplexity_len_2048: 299.093] [val/loss_avg_len_1024: 5.720] [val/perplexity_len_1024: 304.853] [val/loss_avg_len_512: 5.749] [val/perplexity_len_512: 313.834] +[2025-10-28 03:23:54][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:35:27] [ETA: 1:39:21] [loss: 5.691] [tokens/s: 177382.867] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:25:02][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:36:35] [ETA: 1:36:35] [loss: 5.680] [tokens/s: 194044.503] [batches/s: 0.093] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:25:02][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5795.387] [train_eval/train_update_time: 3378.042] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.703] [train_eval/perplexity_len_2048: 299.686] [train_eval/loss_avg_len_1024: 5.718] [train_eval/perplexity_len_1024: 304.318] [train_eval/loss_avg_len_512: 5.746] [train_eval/perplexity_len_512: 312.890] +[2025-10-28 03:25:02][train:194][INFO] Running validation... +[2025-10-28 03:26:42][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 5795.387] [val/train_update_time: 3378.042] [val/loss: 5.679] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.290] [val/val_tokens_per_second: 408416.518] [val/loss_avg_len_2048: 5.679] [val/perplexity_len_2048: 292.634] [val/loss_avg_len_1024: 5.698] [val/perplexity_len_1024: 298.414] [val/loss_avg_len_512: 5.728] [val/perplexity_len_512: 307.462] +[2025-10-28 03:26:42][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt... +[2025-10-28 03:26:43][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt. +[2025-10-28 03:26:43][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.435] +[2025-10-28 03:27:50][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:39:23] [ETA: 1:35:29] [loss: 5.678] [tokens/s: 177347.327] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:28:58][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:40:31] [ETA: 1:32:47] [loss: 5.672] [tokens/s: 193874.283] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:28:58][train:194][INFO] Running validation... +[2025-10-28 03:30:38][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 6031.410] [val/train_update_time: 3513.095] [val/loss: 5.663] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.906] [val/val_tokens_per_second: 409985.253] [val/loss_avg_len_2048: 5.663] [val/perplexity_len_2048: 287.938] [val/loss_avg_len_1024: 5.683] [val/perplexity_len_1024: 293.773] [val/loss_avg_len_512: 5.713] [val/perplexity_len_512: 302.903] +[2025-10-28 03:31:46][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:43:18] [ETA: 1:31:37] [loss: 5.642] [tokens/s: 177326.588] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:32:53][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:44:26] [ETA: 1:28:58] [loss: 5.654] [tokens/s: 193938.037] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:32:53][train:194][INFO] Running validation... +[2025-10-28 03:34:33][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 6266.654] [val/train_update_time: 3648.162] [val/loss: 5.651] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.816] [val/val_tokens_per_second: 410355.068] [val/loss_avg_len_2048: 5.651] [val/perplexity_len_2048: 284.515] [val/loss_avg_len_1024: 5.671] [val/perplexity_len_1024: 290.340] [val/loss_avg_len_512: 5.702] [val/perplexity_len_512: 299.459] +[2025-10-28 03:35:41][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:47:14] [ETA: 1:27:44] [loss: 5.601] [tokens/s: 177398.714] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:35:41][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6434.099] [train_eval/train_update_time: 3715.676] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.656] [train_eval/perplexity_len_2048: 285.901] [train_eval/loss_avg_len_1024: 5.670] [train_eval/perplexity_len_1024: 289.963] [train_eval/loss_avg_len_512: 5.696] [train_eval/perplexity_len_512: 297.761] +[2025-10-28 03:36:48][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:48:21] [ETA: 1:25:08] [loss: 5.638] [tokens/s: 193941.089] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:36:48][train:194][INFO] Running validation... +[2025-10-28 03:38:29][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 6501.730] [val/train_update_time: 3783.195] [val/loss: 5.633] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.261] [val/val_tokens_per_second: 408531.878] [val/loss_avg_len_2048: 5.633] [val/perplexity_len_2048: 279.424] [val/loss_avg_len_1024: 5.653] [val/perplexity_len_1024: 285.247] [val/loss_avg_len_512: 5.685] [val/perplexity_len_512: 294.396] +[2025-10-28 03:39:36][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:51:09] [ETA: 1:23:51] [loss: 5.594] [tokens/s: 177336.460] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:40:44][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:52:17] [ETA: 1:21:18] [loss: 5.651] [tokens/s: 193966.303] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:40:44][train:194][INFO] Running validation... +[2025-10-28 03:42:24][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 6737.261] [val/train_update_time: 3918.230] [val/loss: 5.620] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.222] [val/val_tokens_per_second: 408691.590] [val/loss_avg_len_2048: 5.620] [val/perplexity_len_2048: 275.776] [val/loss_avg_len_1024: 5.641] [val/perplexity_len_1024: 281.647] [val/loss_avg_len_512: 5.673] [val/perplexity_len_512: 290.917] +[2025-10-28 03:43:32][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:55:05] [ETA: 1:19:58] [loss: 5.652] [tokens/s: 177362.441] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:44:39][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:56:12] [ETA: 1:17:28] [loss: 5.643] [tokens/s: 194070.780] [batches/s: 0.093] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:44:39][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6972.699] [train_eval/train_update_time: 4053.228] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.618] [train_eval/perplexity_len_2048: 275.425] [train_eval/loss_avg_len_1024: 5.633] [train_eval/perplexity_len_1024: 279.583] [train_eval/loss_avg_len_512: 5.662] [train_eval/perplexity_len_512: 287.863] +[2025-10-28 03:44:39][train:194][INFO] Running validation... +[2025-10-28 03:46:20][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 6972.699] [val/train_update_time: 4053.228] [val/loss: 5.607] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.519] [val/val_tokens_per_second: 407485.827] [val/loss_avg_len_2048: 5.607] [val/perplexity_len_2048: 272.312] [val/loss_avg_len_1024: 5.628] [val/perplexity_len_1024: 278.191] [val/loss_avg_len_512: 5.661] [val/perplexity_len_512: 287.474] +[2025-10-28 03:46:20][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt... +[2025-10-28 03:46:20][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt. +[2025-10-28 03:46:20][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.547] +[2025-10-28 03:47:28][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:59:01] [ETA: 1:16:05] [loss: 5.618] [tokens/s: 177320.138] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:48:36][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 2:00:08] [ETA: 1:13:38] [loss: 5.589] [tokens/s: 193873.657] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:48:36][train:194][INFO] Running validation... +[2025-10-28 03:50:16][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 7208.983] [val/train_update_time: 4188.229] [val/loss: 5.596] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.085] [val/val_tokens_per_second: 409252.574] [val/loss_avg_len_2048: 5.596] [val/perplexity_len_2048: 269.274] [val/loss_avg_len_1024: 5.617] [val/perplexity_len_1024: 275.188] [val/loss_avg_len_512: 5.651] [val/perplexity_len_512: 284.560] +[2025-10-28 03:51:23][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 2:02:56] [ETA: 1:12:12] [loss: 5.613] [tokens/s: 177308.237] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:52:31][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 2:04:04] [ETA: 1:09:47] [loss: 5.598] [tokens/s: 193846.873] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:52:31][train:194][INFO] Running validation... +[2025-10-28 03:54:11][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 7444.285] [val/train_update_time: 4323.227] [val/loss: 5.585] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.351] [val/val_tokens_per_second: 408168.991] [val/loss_avg_len_2048: 5.585] [val/perplexity_len_2048: 266.352] [val/loss_avg_len_1024: 5.607] [val/perplexity_len_1024: 272.291] [val/loss_avg_len_512: 5.641] [val/perplexity_len_512: 281.717] +[2025-10-28 03:55:19][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 2:06:52] [ETA: 1:08:18] [loss: 5.568] [tokens/s: 177245.482] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:55:19][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7612.232] [train_eval/train_update_time: 4390.719] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.597] [train_eval/perplexity_len_2048: 269.491] [train_eval/loss_avg_len_1024: 5.617] [train_eval/perplexity_len_1024: 274.991] [train_eval/loss_avg_len_512: 5.647] [train_eval/perplexity_len_512: 283.339] +[2025-10-28 03:56:26][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 2:07:59] [ETA: 1:05:56] [loss: 5.591] [tokens/s: 193842.688] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 03:56:26][train:194][INFO] Running validation... +[2025-10-28 03:58:06][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 7679.832] [val/train_update_time: 4458.219] [val/loss: 5.575] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.008] [val/val_tokens_per_second: 409566.848] [val/loss_avg_len_2048: 5.575] [val/perplexity_len_2048: 263.717] [val/loss_avg_len_1024: 5.597] [val/perplexity_len_1024: 269.670] [val/loss_avg_len_512: 5.632] [val/perplexity_len_512: 279.145] +[2025-10-28 03:59:14][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 2:10:47] [ETA: 1:04:25] [loss: 5.548] [tokens/s: 177291.556] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:00:22][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 2:11:55] [ETA: 1:02:04] [loss: 5.566] [tokens/s: 193892.423] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:00:22][train:194][INFO] Running validation... +[2025-10-28 04:02:02][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 7915.051] [val/train_update_time: 4593.218] [val/loss: 5.566] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.562] [val/val_tokens_per_second: 407309.717] [val/loss_avg_len_2048: 5.566] [val/perplexity_len_2048: 261.397] [val/loss_avg_len_1024: 5.589] [val/perplexity_len_1024: 267.359] [val/loss_avg_len_512: 5.624] [val/perplexity_len_512: 276.891] +[2025-10-28 04:03:10][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 2:14:43] [ETA: 1:00:31] [loss: 5.576] [tokens/s: 177245.079] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:04:17][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 2:15:50] [ETA: 0:58:13] [loss: 5.572] [tokens/s: 193982.886] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:04:17][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8150.835] [train_eval/train_update_time: 4728.230] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.567] [train_eval/perplexity_len_2048: 261.654] [train_eval/loss_avg_len_1024: 5.587] [train_eval/perplexity_len_1024: 266.970] [train_eval/loss_avg_len_512: 5.621] [train_eval/perplexity_len_512: 276.270] +[2025-10-28 04:04:17][train:194][INFO] Running validation... +[2025-10-28 04:05:57][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 8150.835] [val/train_update_time: 4728.230] [val/loss: 5.558] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.887] [val/val_tokens_per_second: 410062.622] [val/loss_avg_len_2048: 5.558] [val/perplexity_len_2048: 259.375] [val/loss_avg_len_1024: 5.581] [val/perplexity_len_1024: 265.356] [val/loss_avg_len_512: 5.616] [val/perplexity_len_512: 274.916] +[2025-10-28 04:05:57][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt... +[2025-10-28 04:05:58][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt. +[2025-10-28 04:05:58][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.542] +[2025-10-28 04:07:06][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:18:38] [ETA: 0:56:37] [loss: 5.558] [tokens/s: 177340.233] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:08:13][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:19:46] [ETA: 0:54:21] [loss: 5.551] [tokens/s: 193920.123] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:08:13][train:194][INFO] Running validation... +[2025-10-28 04:09:53][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 8386.480] [val/train_update_time: 4863.236] [val/loss: 5.550] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.086] [val/val_tokens_per_second: 409249.749] [val/loss_avg_len_2048: 5.550] [val/perplexity_len_2048: 257.289] [val/loss_avg_len_1024: 5.573] [val/perplexity_len_1024: 263.280] [val/loss_avg_len_512: 5.609] [val/perplexity_len_512: 272.875] +[2025-10-28 04:11:01][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:22:34] [ETA: 0:52:43] [loss: 5.573] [tokens/s: 177341.769] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:12:08][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:23:41] [ETA: 0:50:29] [loss: 5.564] [tokens/s: 193966.117] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:12:08][train:194][INFO] Running validation... +[2025-10-28 04:13:49][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 8621.792] [val/train_update_time: 4998.251] [val/loss: 5.543] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.185] [val/val_tokens_per_second: 408845.489] [val/loss_avg_len_2048: 5.543] [val/perplexity_len_2048: 255.557] [val/loss_avg_len_1024: 5.567] [val/perplexity_len_1024: 261.566] [val/loss_avg_len_512: 5.603] [val/perplexity_len_512: 271.196] +[2025-10-28 04:14:56][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:26:29] [ETA: 0:48:49] [loss: 5.561] [tokens/s: 177362.861] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:14:56][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8789.584] [train_eval/train_update_time: 5065.749] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.549] [train_eval/perplexity_len_2048: 256.889] [train_eval/loss_avg_len_1024: 5.570] [train_eval/perplexity_len_1024: 262.327] [train_eval/loss_avg_len_512: 5.604] [train_eval/perplexity_len_512: 271.396] +[2025-10-28 04:16:04][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:27:37] [ETA: 0:46:37] [loss: 5.501] [tokens/s: 193929.065] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:16:04][train:194][INFO] Running validation... +[2025-10-28 04:17:44][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 8857.200] [val/train_update_time: 5133.254] [val/loss: 5.538] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.439] [val/val_tokens_per_second: 407810.770] [val/loss_avg_len_2048: 5.538] [val/perplexity_len_2048: 254.103] [val/loss_avg_len_1024: 5.561] [val/perplexity_len_1024: 260.106] [val/loss_avg_len_512: 5.598] [val/perplexity_len_512: 269.763] +[2025-10-28 04:18:52][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:30:25] [ETA: 0:44:55] [loss: 5.579] [tokens/s: 177295.539] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:19:59][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:31:32] [ETA: 0:42:44] [loss: 5.490] [tokens/s: 193947.109] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:19:59][train:194][INFO] Running validation... +[2025-10-28 04:21:40][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 9092.876] [val/train_update_time: 5268.263] [val/loss: 5.532] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.486] [val/val_tokens_per_second: 407619.115] [val/loss_avg_len_2048: 5.532] [val/perplexity_len_2048: 252.762] [val/loss_avg_len_1024: 5.556] [val/perplexity_len_1024: 258.793] [val/loss_avg_len_512: 5.593] [val/perplexity_len_512: 268.502] +[2025-10-28 04:22:48][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:34:20] [ETA: 0:41:01] [loss: 5.528] [tokens/s: 177302.413] [batches/s: 0.085] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:23:55][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:35:28] [ETA: 0:38:52] [loss: 5.501] [tokens/s: 193938.822] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:23:55][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9328.580] [train_eval/train_update_time: 5403.267] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.534] [train_eval/perplexity_len_2048: 253.146] [train_eval/loss_avg_len_1024: 5.557] [train_eval/perplexity_len_1024: 259.015] [train_eval/loss_avg_len_512: 5.591] [train_eval/perplexity_len_512: 267.937] +[2025-10-28 04:23:55][train:194][INFO] Running validation... +[2025-10-28 04:25:36][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 9328.580] [val/train_update_time: 5403.267] [val/loss: 5.528] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.730] [val/val_tokens_per_second: 406631.809] [val/loss_avg_len_2048: 5.528] [val/perplexity_len_2048: 251.598] [val/loss_avg_len_1024: 5.552] [val/perplexity_len_1024: 257.636] [val/loss_avg_len_512: 5.589] [val/perplexity_len_512: 267.351] +[2025-10-28 04:25:36][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt... +[2025-10-28 04:25:36][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt. +[2025-10-28 04:25:36][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.541] +[2025-10-28 04:26:44][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:38:17] [ETA: 0:37:07] [loss: 5.472] [tokens/s: 177175.592] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:27:52][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:39:25] [ETA: 0:34:59] [loss: 5.507] [tokens/s: 193722.586] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:27:52][train:194][INFO] Running validation... +[2025-10-28 04:29:32][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 9565.069] [val/train_update_time: 5538.269] [val/loss: 5.524] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.263] [val/val_tokens_per_second: 408527.292] [val/loss_avg_len_2048: 5.524] [val/perplexity_len_2048: 250.618] [val/loss_avg_len_1024: 5.548] [val/perplexity_len_1024: 256.657] [val/loss_avg_len_512: 5.585] [val/perplexity_len_512: 266.386] +[2025-10-28 04:30:40][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:42:12] [ETA: 0:33:13] [loss: 5.533] [tokens/s: 177147.677] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:31:47][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:43:20] [ETA: 0:31:06] [loss: 5.483] [tokens/s: 193706.192] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:31:47][train:194][INFO] Running validation... +[2025-10-28 04:33:28][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 9800.571] [val/train_update_time: 5673.296] [val/loss: 5.521] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.787] [val/val_tokens_per_second: 406402.771] [val/loss_avg_len_2048: 5.521] [val/perplexity_len_2048: 249.779] [val/loss_avg_len_1024: 5.544] [val/perplexity_len_1024: 255.825] [val/loss_avg_len_512: 5.582] [val/perplexity_len_512: 265.578] +[2025-10-28 04:34:36][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:46:08] [ETA: 0:29:19] [loss: 5.566] [tokens/s: 177052.208] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:34:36][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9968.994] [train_eval/train_update_time: 5740.817] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.516] [train_eval/perplexity_len_2048: 248.681] [train_eval/loss_avg_len_1024: 5.533] [train_eval/perplexity_len_1024: 252.830] [train_eval/loss_avg_len_512: 5.568] [train_eval/perplexity_len_512: 261.814] +[2025-10-28 04:35:43][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:47:16] [ETA: 0:27:13] [loss: 5.534] [tokens/s: 193639.040] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:35:43][train:194][INFO] Running validation... +[2025-10-28 04:37:23][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 10036.600] [val/train_update_time: 5808.321] [val/loss: 5.518] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.266] [val/val_tokens_per_second: 408513.536] [val/loss_avg_len_2048: 5.518] [val/perplexity_len_2048: 249.091] [val/loss_avg_len_1024: 5.542] [val/perplexity_len_1024: 255.147] [val/loss_avg_len_512: 5.579] [val/perplexity_len_512: 264.910] +[2025-10-28 04:38:31][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:50:04] [ETA: 0:25:24] [loss: 5.472] [tokens/s: 177078.559] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:39:39][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:51:12] [ETA: 0:23:20] [loss: 5.504] [tokens/s: 193683.764] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:39:39][train:194][INFO] Running validation... +[2025-10-28 04:41:19][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 10272.080] [val/train_update_time: 5943.330] [val/loss: 5.516] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.354] [val/val_tokens_per_second: 408153.153] [val/loss_avg_len_2048: 5.516] [val/perplexity_len_2048: 248.572] [val/loss_avg_len_1024: 5.540] [val/perplexity_len_1024: 254.629] [val/loss_avg_len_512: 5.577] [val/perplexity_len_512: 264.398] +[2025-10-28 04:42:27][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:54:00] [ETA: 0:21:30] [loss: 5.567] [tokens/s: 177102.242] [batches/s: 0.084] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:43:34][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:55:07] [ETA: 0:19:27] [loss: 5.464] [tokens/s: 193850.283] [batches/s: 0.092] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:43:34][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 10507.650] [train_eval/train_update_time: 6078.328] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.516] [train_eval/perplexity_len_2048: 248.575] [train_eval/loss_avg_len_1024: 5.536] [train_eval/perplexity_len_1024: 253.742] [train_eval/loss_avg_len_512: 5.572] [train_eval/perplexity_len_512: 262.860] +[2025-10-28 04:43:34][train:194][INFO] Running validation... +[2025-10-28 04:45:15][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 10507.650] [val/train_update_time: 6078.328] [val/loss: 5.514] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.790] [val/val_tokens_per_second: 406389.591] [val/loss_avg_len_2048: 5.514] [val/perplexity_len_2048: 248.160] [val/loss_avg_len_1024: 5.538] [val/perplexity_len_1024: 254.220] [val/loss_avg_len_512: 5.576] [val/perplexity_len_512: 264.001] +[2025-10-28 04:45:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt... +[2025-10-28 04:45:16][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt. +[2025-10-28 04:45:16][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.573] +[2025-10-28 04:46:55][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:58:27] [ETA: 0:17:39] [loss: 5.510] [tokens/s: 172480.392] [batches/s: 0.082] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:50:06][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 3:01:39] [ETA: 0:15:47] [loss: 5.545] [tokens/s: 169208.921] [batches/s: 0.081] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:50:06][train:194][INFO] Running validation... +[2025-10-28 04:54:23][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 10899.097] [val/train_update_time: 6367.991] [val/loss: 5.513] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 256.654] [val/val_tokens_per_second: 159592.450] [val/loss_avg_len_2048: 5.513] [val/perplexity_len_2048: 247.864] [val/loss_avg_len_1024: 5.537] [val/perplexity_len_1024: 253.923] [val/loss_avg_len_512: 5.575] [val/perplexity_len_512: 263.702] +[2025-10-28 04:56:33][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 3:08:05] [ETA: 0:14:09] [loss: 5.524] [tokens/s: 134270.643] [batches/s: 0.064] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:59:38][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 3:11:11] [ETA: 0:12:12] [loss: 5.470] [tokens/s: 132805.321] [batches/s: 0.063] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 04:59:38][train:194][INFO] Running validation... +[2025-10-28 05:04:08][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 11471.464] [val/train_update_time: 6683.227] [val/loss: 5.512] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 269.618] [val/val_tokens_per_second: 151918.872] [val/loss_avg_len_2048: 5.512] [val/perplexity_len_2048: 247.668] [val/loss_avg_len_1024: 5.536] [val/perplexity_len_1024: 253.726] [val/loss_avg_len_512: 5.574] [val/perplexity_len_512: 263.505] +[2025-10-28 05:06:18][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 3:17:51] [ETA: 0:10:24] [loss: 5.509] [tokens/s: 109514.305] [batches/s: 0.052] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 05:06:18][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 11871.571] [train_eval/train_update_time: 6813.441] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.506] [train_eval/perplexity_len_2048: 246.222] [train_eval/loss_avg_len_1024: 5.530] [train_eval/perplexity_len_1024: 252.035] [train_eval/loss_avg_len_512: 5.564] [train_eval/perplexity_len_512: 260.951] +[2025-10-28 05:09:01][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 3:20:34] [ETA: 0:08:21] [loss: 5.492] [tokens/s: 109797.220] [batches/s: 0.052] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 05:09:01][train:194][INFO] Running validation... +[2025-10-28 05:13:46][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 12034.565] [val/train_update_time: 6976.259] [val/loss: 5.512] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 284.595] [val/val_tokens_per_second: 143923.791] [val/loss_avg_len_2048: 5.512] [val/perplexity_len_2048: 247.558] [val/loss_avg_len_1024: 5.536] [val/perplexity_len_1024: 253.618] [val/loss_avg_len_512: 5.574] [val/perplexity_len_512: 263.405] +[2025-10-28 05:15:56][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 3:27:29] [ETA: 0:06:25] [loss: 5.528] [tokens/s: 92744.415] [batches/s: 0.044] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 05:18:13][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 3:29:46] [ETA: 0:04:16] [loss: 5.510] [tokens/s: 94077.610] [batches/s: 0.045] [MFU: 0.000] [TFLOPS: 0.000] +[2025-10-28 05:18:13][train:194][INFO] Running validation... +[2025-10-28 05:23:14][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 12586.090] [val/train_update_time: 7242.766] [val/loss: 5.511] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 301.208] [val/val_tokens_per_second: 135985.772] [val/loss_avg_len_2048: 5.511] [val/perplexity_len_2048: 247.507] [val/loss_avg_len_1024: 5.536] [val/perplexity_len_1024: 253.567] [val/loss_avg_len_512: 5.573] [val/perplexity_len_512: 263.350] +[2025-10-28 05:23:14][train:854][INFO] Training finished with 2055208960 tokens! diff --git a/metrics/jsonlines/checkpoint.jsonl b/metrics/jsonlines/checkpoint.jsonl index 6d4622c89891de42b5266e0a26ab9bfe9e3da90b..cc8bc905a40df7437d7fd0119058d41ed4349b6b 100644 --- a/metrics/jsonlines/checkpoint.jsonl +++ b/metrics/jsonlines/checkpoint.jsonl @@ -1,9 +1,9 @@ -{"step": 209715200, "checkpoint/checkpoint_time": 0.4586985750356689} -{"step": 419430400, "checkpoint/checkpoint_time": 0.452107127988711} -{"step": 629145600, "checkpoint/checkpoint_time": 0.45928702398668975} -{"step": 838860800, "checkpoint/checkpoint_time": 0.4599621079978533} -{"step": 1048576000, "checkpoint/checkpoint_time": 0.4533659809967503} -{"step": 1258291200, "checkpoint/checkpoint_time": 0.4462293910328299} -{"step": 1468006400, "checkpoint/checkpoint_time": 0.4543691629660316} -{"step": 1677721600, "checkpoint/checkpoint_time": 0.5127520990208723} -{"step": 1887436800, "checkpoint/checkpoint_time": 0.4454628659877926} +{"step": 209715200, "checkpoint/checkpoint_time": 0.4554534360067919} +{"step": 419430400, "checkpoint/checkpoint_time": 0.44249288097489625} +{"step": 629145600, "checkpoint/checkpoint_time": 0.42379301704932004} +{"step": 838860800, "checkpoint/checkpoint_time": 0.4248116289963946} +{"step": 1048576000, "checkpoint/checkpoint_time": 0.43460876401513815} +{"step": 1258291200, "checkpoint/checkpoint_time": 0.5474804400000721} +{"step": 1468006400, "checkpoint/checkpoint_time": 0.5417233019834384} +{"step": 1677721600, "checkpoint/checkpoint_time": 0.5410387290176004} +{"step": 1887436800, "checkpoint/checkpoint_time": 0.5728158770361915} diff --git a/metrics/jsonlines/norm.jsonl b/metrics/jsonlines/norm.jsonl index 10b2abe3834913c001231b43be3fb86d07059286..900aa1428bb9ad167cb4b9a6a657f75690423d67 100644 --- a/metrics/jsonlines/norm.jsonl +++ b/metrics/jsonlines/norm.jsonl @@ -1,98 +1,98 @@ -{"step": 20971520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 71.96881103515625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.04931763932108879, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.992537498474121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0013542725937440991, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.17136812210083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.005113104358315468, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.1492719650268555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.005361592397093773, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.148346424102783, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.0356970876455307, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.15713357925415, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.04859118163585663, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.987136840820312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0012968811206519604, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 12.615348815917969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.042889636009931564, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 8.929548263549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.04455239698290825, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.001537322998047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0014995222445577383, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.1575236320495605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0024944604374468327, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.185313701629639, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.002509406069293618, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.166274547576904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07670743018388748, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.143224716186523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1674593985080719, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.005922317504883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002554647158831358, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 12.627293586730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06328180432319641, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 8.90603256225586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07561032474040985, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.017423629760742, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.06265796720981598, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 72.25432586669922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.9707998633384705} -{"step": 41943040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 73.00873565673828, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.04540823772549629, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.996996879577637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.001184858032502234, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.457752704620361, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.007311895955353975, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.446048259735107, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.004716766998171806, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.281280994415283, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.024227004498243332, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.282985687255859, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.028031563386321068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.979215621948242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0007765464251860976, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 12.957906723022461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.021230163052678108, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.175869941711426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.022059369832277298, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.006433486938477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0005142671870999038, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.340754985809326, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0021931882947683334, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.370672702789307, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.002267027273774147, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.303278923034668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.01713803969323635, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.284801006317139, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.039099257439374924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.98387336730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0012423257576301694, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 12.904457092285156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.021154584363102913, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.11008071899414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.028935007750988007, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.15167236328125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0755498930811882, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 80.06193542480469, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.9559179544448853} -{"step": 62914560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 74.06414031982422, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14538046717643738, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.025676727294922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0064844065345823765, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.736937046051025, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025005808100104332, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.785254955291748, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.014604698866605759, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.3871917724609375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1424376368522644, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.378758907318115, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2028896063566208, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.988592147827148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.012621867470443249, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.171307563781738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.26772281527519226, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.337404251098633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.24615249037742615, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.001752853393555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.01234081294387579, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.451330184936523, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01306989137083292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.480719566345215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.01254056766629219, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.373337268829346, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16805213689804077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.365626335144043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12980543076992035, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.949562072753906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0034623565152287483, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.050446510314941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09977266937494278, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.230801582336426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07658229768276215, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.297536849975586, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.017193246632814407, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 89.52496337890625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.2164129763841629} -{"step": 83886080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 75.31272888183594, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16386568546295166, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.04182243347168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007264754269272089, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.847298622131348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03737557306885719, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 5.921523094177246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.025941122323274612, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.46702766418457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15107837319374084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.44968843460083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16013997793197632, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.002811431884766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00825895182788372, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.298969268798828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.129194974899292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.436746597290039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1324339658021927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.990806579589844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006124014500528574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.475744724273682, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017109565436840057, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.4925689697265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.018278248608112335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.39017391204834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12161125242710114, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.391374588012695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09431824833154678, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.919949531555176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004338539205491543, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.129576683044434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08807579427957535, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.288630485534668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05840325728058815, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.415802001953125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02059660479426384, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 96.92658996582031, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.17372927069664001} -{"step": 104857600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 76.7603759765625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11061643064022064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.05310821533203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004891774617135525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 5.933614730834961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02395368367433548, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.015559673309326, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.018544230610132217, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.532172203063965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.10083300620317459, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.510876178741455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1025606170296669, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.010618209838867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003967324271798134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.401419639587402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.07154672592878342, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.518426895141602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07494894415140152, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.98331069946289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003211443545296788, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.494350433349609, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011250595562160015, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.504055500030518, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.009306743741035461, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.402806758880615, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06876182556152344, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.4115095138549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.049699556082487106, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.89090633392334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0020288326777517796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.180597305297852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.052380070090293884, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.3232421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03268451243638992, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.526533126831055, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02623419277369976, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 102.65614318847656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.12275241315364838} -{"step": 125829120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 78.26237487792969, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13649336993694305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.06202507019043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005410207901149988, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.0194196701049805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02573239989578724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.107144355773926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023679643869400024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.577592372894287, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12094458192586899, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.552911281585693, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1344512552022934, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.018556594848633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004314595367759466, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.501096725463867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10377024114131927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.59815502166748, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08901906758546829, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.972997665405273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003723812522366643, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.506057262420654, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010270838625729084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.511464595794678, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.014140572398900986, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.403275012969971, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07901971787214279, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.417155742645264, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05525336042046547, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.878167152404785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0021998484153300524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.238990783691406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05767178535461426, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.365641593933105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03323224186897278, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.641820907592773, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.027351388707756996, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 107.4212417602539, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1167360469698906} -{"step": 146800640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 79.72596740722656, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10993031412363052, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.070152282714844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004250567406415939, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.09910249710083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02508348599076271, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.19337797164917, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.024250071495771408, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6124396324157715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.10210473835468292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.586574554443359, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10805269330739975, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.024118423461914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0030770106241106987, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.596692085266113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.07527415454387665, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.670165061950684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07865290343761444, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.965954780578613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.002927494700998068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.534626483917236, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010779489763081074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.53407621383667, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.011287825182080269, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.403009414672852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06137095391750336, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.4213385581970215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.047091543674468994, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.872982025146484, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0034915395081043243, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.30893611907959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07827045023441315, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.413275718688965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03981134295463562, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.76524543762207, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.024798361584544182, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 111.92776489257812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07620520889759064} -{"step": 167772160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 81.11697387695312, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16381089389324188, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.07809829711914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00558520806953311, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.169198989868164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04769541695713997, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.268050193786621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.043042413890361786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6397552490234375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13144974410533905, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.6147966384887695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1474735289812088, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.023719787597656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.006838895846158266, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.682353019714355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11003199964761734, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.737309455871582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12985104322433472, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.963591575622559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0056404112838208675, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.583140850067139, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0190704558044672, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.575356483459473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02610960602760315, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.403573036193848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12905162572860718, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.427280426025391, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0765448659658432, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.870985984802246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004754543304443359, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.387280464172363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10355594754219055, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.471118927001953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05705927312374115, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 16.89935302734375, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.03674080967903137, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 116.56954193115234, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.13951759040355682} -{"step": 188743680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 82.41007232666016, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.5650343894958496, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.087356567382812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.016713658347725868, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.232445240020752, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.08184316009283066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.33575439453125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.09211808443069458, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.662089824676514, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.36882197856903076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.637607097625732, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.3669591248035431, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.020296096801758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.010138771496713161, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.752613067626953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.24021956324577332, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.792492866516113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.2693847417831421, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.962433815002441, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.012886950746178627, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.622135162353516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.05372494086623192, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.611760139465332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.09803672879934311, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.40715217590332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2795186936855316, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.436324119567871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.17436639964580536, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.871770858764648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.011147469282150269, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.46767807006836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.25243741273880005, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.527655601501465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.12640281021595, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.033374786376953, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.015848729759454727, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 121.22796630859375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.25085940957069397} -{"step": 209715200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 83.59319305419922, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1461310237646103, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.096269607543945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007241293787956238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.283783435821533, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026022691279649734, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.389902114868164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02514633536338806, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.682618618011475, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15873005986213684, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.657993316650391, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1633145660161972, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.01824951171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004135854076594114, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.818107604980469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10426928848028183, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.838113784790039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09121031314134598, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.95872688293457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0024691910948604345, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.644936561584473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014624416828155518, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.635018348693848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020583635196089745, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.410575866699219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07723336666822433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.442782402038574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07240378856658936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.87127685546875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002732686698436737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.545246124267578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07344966381788254, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.572124481201172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.046846773475408554, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.16185188293457, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.026306094601750374, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 125.76351928710938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1177796944975853} -{"step": 230686720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 84.69913482666016, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18743613362312317, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.104455947875977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00586846424266696, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.3361406326293945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04332682490348816, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.444019317626953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.054454490542411804, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.695607662200928, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16990546882152557, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.671508312225342, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17567279934883118, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.01447296142578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004230795428156853, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.876253128051758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1073693260550499, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.875895500183105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09651000052690506, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.956161499023438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003555156523361802, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.674731731414795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021757839247584343, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.661999225616455, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030668985098600388, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.41394567489624, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08994776755571365, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.449863433837891, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06560057401657104, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.875346183776855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0022305864840745926, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.629971504211426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05767200514674187, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.623741149902344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0379190631210804, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.293920516967773, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02428075671195984, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 130.2657928466797, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07473991811275482} -{"step": 251658240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 85.73606872558594, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.30375155806541443, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.113414764404297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.010266134515404701, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.393929481506348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.06551215052604675, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.501550197601318, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0825042873620987, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.706328868865967, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2922962009906769, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.682680606842041, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.30217018723487854, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.012327194213867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007647043559700251, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.930069923400879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.18516004085540771, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.907761573791504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.19138911366462708, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.954083442687988, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.007459721527993679, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.707117557525635, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.034558966755867004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.69045877456665, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05369944870471954, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.41765832901001, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.21760663390159607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.455567836761475, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.13044141232967377, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.88191032409668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.008365784771740437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.715105056762695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.17823369801044464, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.674895286560059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11238112300634384, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.425983428955078, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0328485369682312, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 134.69320678710938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1345190852880478} -{"step": 272629760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 86.71636199951172, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14572453498840332, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.122333526611328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005680927075445652, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.4540300369262695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.028617724776268005, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.559982776641846, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03740493208169937, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.714641571044922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1463662087917328, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.691328048706055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.15533828735351562, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.01028823852539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00379374111071229, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 13.979470252990723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09458291530609131, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.934347152709961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08958129584789276, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.95456600189209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003523631487041712, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.7457275390625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01592237316071987, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.723804473876953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.022755825892090797, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.422701835632324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0898897647857666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.461367607116699, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07619641721248627, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.889281272888184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00413482403382659, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.799829483032227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10851701349020004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.72078800201416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06052534654736519, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.55770492553711, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01679369807243347, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 139.0140838623047, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06403554975986481} -{"step": 293601280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 87.64751434326172, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.30461931228637695, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.130203247070312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.01059363316744566, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.511850357055664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.07350965589284897, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.615236759185791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.08362721651792526, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.719867706298828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2967369556427002, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.697425365447998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2980238199234009, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.007770538330078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0066581652499735355, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.021233558654785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1923466920852661, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.956254959106445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.19457945227622986, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.955404281616211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.007921387441456318, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.7835187911987305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.030712926760315895, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.756180286407471, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.045275039970874786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.428191661834717, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.206342414021492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.46806001663208, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12375067919492722, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.896698951721191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005663768388330936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.88111400604248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1365692913532257, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.767130851745605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09039931744337082, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.69210433959961, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.02481245994567871, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 143.26095581054688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.11082061380147934} -{"step": 314572800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 88.53726196289062, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2071615606546402, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.136627197265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006555133033543825, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.5657124519348145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04668228700757027, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.664050102233887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05848678946495056, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.722399711608887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18701401352882385, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.701833724975586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18739226460456848, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.003705978393555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003987635485827923, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.055682182312012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11685555428266525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.973167419433594, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09588383883237839, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.958077430725098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003974501509219408, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.822863578796387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.0226253904402256, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.789274215698242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03616907447576523, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.434256553649902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10583212971687317, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.476086139678955, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07587986439466476, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.906002044677734, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002768697449937463, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 13.961162567138672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08153172582387924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.812338829040527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.051268771290779114, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.826908111572266, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.020027900114655495, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 147.4137725830078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.062378719449043274} -{"step": 335544320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 89.38903045654297, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2912493646144867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.143003463745117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.010222469456493855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.623172283172607, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03463352844119072, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.716842174530029, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04221378639340401, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.721902370452881, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.3336799144744873, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.702856540679932, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.3420841097831726, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 16.0008544921875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007723232265561819, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.090124130249023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.21644122898578644, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 9.99006462097168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.19878770411014557, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.96015739440918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006991758942604065, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.866939067840576, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.056607842445373535, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.825839042663574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.07864652574062347, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.437992572784424, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19980639219284058, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.480666160583496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1221950501203537, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.918475151062012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.007247462868690491, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.04330825805664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.14326068758964539, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.861323356628418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10641848295927048, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 17.96173667907715, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.016049351543188095, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 151.44456481933594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06723648309707642} -{"step": 356515840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 90.19160461425781, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1591528058052063, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.152223587036133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0057294294238090515, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.6835455894470215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03503607213497162, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.768935680389404, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04263073951005936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.724793910980225, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1623111069202423, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.70596981048584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16757866740226746, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.999734878540039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003505300497636199, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.125237464904785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10629919171333313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.006390571594238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09041694551706314, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.963029861450195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.002821472706273198, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.908164024353027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01485923770815134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.860414505004883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.022679494693875313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.442317008972168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09786002337932587, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.486057281494141, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0730065181851387, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.923501968383789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002891237847507, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.10671329498291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07204169780015945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.890390396118164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04835887625813484, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.088790893554688, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01801745593547821, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 155.28445434570312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06099044531583786} -{"step": 377487360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 90.9617691040039, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.17312587797641754, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.15790557861328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006930000148713589, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.738923072814941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03698308393359184, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.8157172203063965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.050866320729255676, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.721988201141357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19926166534423828, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.703858852386475, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2098204493522644, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.995676040649414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005349702201783657, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.150872230529785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12988822162151337, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.017280578613281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11969906836748123, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.968045234680176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005682238843291998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 5.961435794830322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.028948992490768433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.904244422912598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04274699464440346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.445154666900635, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16648732125759125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.490045547485352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09024994820356369, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.934530258178711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004325734917074442, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.176370620727539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11675412207841873, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.9265775680542, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07732666283845901, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.220352172851562, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.014128396287560463, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 158.99964904785156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05434063822031021} -{"step": 398458880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 91.69988250732422, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1625666618347168, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.1634578704834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005967606324702501, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.794809818267822, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03120761550962925, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.8634490966796875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03510899469256401, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.718137264251709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.21221815049648285, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.7006940841674805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22497636079788208, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.992700576782227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004408417735248804, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.176698684692383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14412517845630646, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.029244422912598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12178158760070801, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.973468780517578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005009130109101534, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.015471458435059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.031063493341207504, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.948024749755859, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05005640909075737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.447422981262207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14242486655712128, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.493185997009277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08576352894306183, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.945077896118164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0038443373050540686, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.239513397216797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10092013329267502, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.960701942443848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06673727184534073, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.352834701538086, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01676461100578308, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 162.57763671875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06677546352148056} -{"step": 419430400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 92.40455627441406, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.19464051723480225, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.169111251831055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006895106285810471, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.848182201385498, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.037793442606925964, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.90703821182251, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04711604863405228, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.71437931060791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23571725189685822, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.697606563568115, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.245804101228714, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.989846229553223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005651076324284077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.199382781982422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.163923978805542, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.039670944213867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.14285604655742645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.978611946105957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004673923831433058, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.068851470947266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.026641210541129112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 5.9898600578308105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03985064476728439, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.449369430541992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1672488898038864, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.495371341705322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09803181886672974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.955362319946289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005675895139575005, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.297798156738281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12617628276348114, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 9.991898536682129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0927279144525528, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.483121871948242, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.014503230340778828, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 165.99276733398438, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06561563163995743} -{"step": 440401920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 93.07496643066406, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16403108835220337, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.173952102661133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006508692633360624, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.899662494659424, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.042267970740795135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.94871711730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04986581206321716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.709960460662842, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18087288737297058, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.694027900695801, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19061879813671112, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.986421585083008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0039451043121516705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.218998908996582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.119062140583992, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.047853469848633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09841607511043549, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.985306739807129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003782752202823758, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.125688076019287, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018775349482893944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.033872127532959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.031115662306547165, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.45194673538208, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11925797909498215, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.498369216918945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0846695676445961, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.965619087219238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0037117807660251856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.354398727416992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10008148103952408, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.021547317504883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06669706106185913, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.612438201904297, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.018189121037721634, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 169.23890686035156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06125329062342644} -{"step": 461373440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 93.71412658691406, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.19683219492435455, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.179401397705078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007823188789188862, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.950921535491943, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03351667523384094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 6.989181995391846, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03925786167383194, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.70599365234375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2585121691226959, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.69064474105835, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.25878316164016724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.983689308166504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005440430715680122, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.23853588104248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16290313005447388, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.057229042053223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13549841940402985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.990549087524414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005029157269746065, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.176910400390625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.026766378432512283, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.07344913482666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0426713228225708, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.453160762786865, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16901369392871857, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.499256610870361, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10158620029687881, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.974098205566406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004531758837401867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.40539264678955, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.128981351852417, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.048802375793457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08252288401126862, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.736473083496094, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.018941501155495644, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 172.30270385742188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08006859570741653} -{"step": 482344960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 94.32404327392578, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14391541481018066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.184288024902344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005490276962518692, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 6.9999237060546875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0313778780400753, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.029140472412109, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03726256638765335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.702419281005859, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18497532606124878, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.687327861785889, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19823108613491058, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.979071617126465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004184789955615997, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.253561973571777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.134581059217453, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.063922882080078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11159893870353699, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 15.997987747192383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003989513032138348, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.235393524169922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.025027750059962273, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.117534637451172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0361638180911541, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.455029010772705, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1396361142396927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.502440452575684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08845683932304382, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.981711387634277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0039496212266385555, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.452568054199219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10886930674314499, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.072250366210938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07466840744018555, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.859399795532227, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012811386026442051, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 175.2296142578125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05014338344335556} -{"step": 503316480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 94.90668487548828, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.23886272311210632, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.18732261657715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.009023028425872326, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.045780658721924, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04068411886692047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.06634521484375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.051469966769218445, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.696258544921875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.27988043427467346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.681412220001221, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.27350547909736633, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.974518775939941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005939262453466654, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.26805305480957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17003081738948822, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.070292472839355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1395224630832672, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.005191802978516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0052534108981490135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.2951154708862305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.03824483975768089, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.162410259246826, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05772445350885391, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.455235958099365, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18310490250587463, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.504451751708984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.106468066573143, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.9917573928833, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004868643824011087, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.50096607208252, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1290210485458374, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.099189758300781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09340953826904297, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 18.985214233398438, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.015595495700836182, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 178.02626037597656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0632520392537117} -{"step": 524288000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 95.46420288085938, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2129581719636917, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.190685272216797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00856686569750309, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.092848777770996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04024466499686241, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.102871894836426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.052788734436035156, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.689743995666504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.27469882369041443, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.6755547523498535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2710307240486145, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.970949172973633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005952667910605669, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.282840728759766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17691653966903687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.076898574829102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.15692855417728424, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.013633728027344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006688413675874472, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.358633995056152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.028270628303289413, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.209234237670898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0481843464076519, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.455600261688232, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2194948047399521, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.505981922149658, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12435879558324814, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.000164031982422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.008309058845043182, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.544983863830566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.22180680930614471, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.12216567993164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1565789133310318, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.109657287597656, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009453732520341873, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 180.68167114257812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08171714097261429} -{"step": 545259520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 95.99844360351562, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18740665912628174, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.19583511352539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007281564641743898, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.14235258102417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03683792054653168, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.142390727996826, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0462929829955101, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.685661792755127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2317795306444168, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.671776294708252, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2404719591140747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.966657638549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0052091749384999275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.295928955078125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16333797574043274, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.083230018615723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12859077751636505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.021970748901367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00401019724085927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.41770601272583, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.027077989652752876, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.251038074493408, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03986559808254242, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.457578182220459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15229885280132294, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.508563041687012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09678283333778381, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.005126953125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0046829720959067345, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.58091926574707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12198591232299805, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.14033031463623, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08787982165813446, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.227602005004883, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.013751668855547905, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 183.16983032226562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05716250091791153} -{"step": 566231040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 96.51044464111328, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.17694659531116486, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.19864273071289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006973331794142723, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.1834211349487305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.039151158183813095, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.173957347869873, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.051588743925094604, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.679991245269775, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22197405993938446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.666493892669678, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22331371903419495, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.961888313293457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0045488872565329075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.307035446166992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1521735042333603, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.08819580078125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11506221443414688, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.031017303466797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038797021843492985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.481729984283447, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02656433917582035, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.296924591064453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03974272683262825, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.457784175872803, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14173448085784912, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.510071754455566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09016625583171844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.01144790649414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.00340847996994853, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.6165189743042, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09916073828935623, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.158905029296875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06567379087209702, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.34688377380371, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012913232669234276, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 185.54823303222656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06248726323246956} -{"step": 587202560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 97.00144958496094, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.23179490864276886, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.201732635498047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.008527200669050217, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.2272114753723145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04377027228474617, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.2084174156188965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.052467286586761475, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.673587322235107, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2544670104980469, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.659947872161865, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2742431163787842, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.957656860351562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005645707715302706, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.318522453308105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1903870552778244, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.092937469482422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1578383594751358, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.040477752685547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.007545360829681158, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.549569606781006, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.044972557574510574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.34547758102417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.07173489034175873, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.457699775695801, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.23084060847759247, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.510474681854248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.13269191980361938, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.01741600036621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.006983682978898287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.65036392211914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.19654256105422974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.176472663879395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.13935473561286926, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.465749740600586, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008205851539969444, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 187.81744384765625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09624528884887695} -{"step": 608174080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 97.46879577636719, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13320878148078918, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.203590393066406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005859633907675743, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.267545223236084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02801077999174595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.239126682281494, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.031031325459480286, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.66706657409668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19354937970638275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.653336048126221, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1933649480342865, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.953012466430664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0036049203481525183, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.328764915466309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1287236362695694, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.097709655761719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10812553763389587, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.050273895263672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038616005331277847, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.616039276123047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02045992948114872, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.3916096687316895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.029610557481646538, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4578728675842285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1404556930065155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.511390209197998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08819843083620071, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.021940231323242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0034827955532819033, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.680251121520996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10487622022628784, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.192811965942383, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07754482328891754, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.58173370361328, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.012390636838972569, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 189.95823669433594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.048110563308000565} -{"step": 629145600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 97.91557312011719, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2160152941942215, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.205007553100586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007660690229386091, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.304483413696289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04169708490371704, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.267967224121094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05892377346754074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.660190105438232, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.266580730676651, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.646608352661133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2515704035758972, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.94870376586914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.006277965381741524, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.337830543518066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.163531094789505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.10193157196045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.138605996966362, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.061737060546875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005184305366128683, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.688570499420166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02722938358783722, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.4403228759765625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04050551354885101, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.458732604980469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18055863678455353, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.514204502105713, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10924617201089859, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02617645263672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005308777093887329, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.708487510681152, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1462230533361435, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.20789909362793, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.11119861900806427, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.69662094116211, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009597444906830788, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 191.9904022216797, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07349905371665955} -{"step": 650117120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 98.34375762939453, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1937798708677292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.20713996887207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006351806689053774, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.340714454650879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.049672771245241165, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.296588897705078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.054893169552087784, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.654265880584717, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20760461688041687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.640387058258057, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20577549934387207, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.944737434387207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004174219910055399, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.34683609008789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1394301801919937, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.106014251708984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10927893966436386, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.073814392089844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003938405308872461, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.764244079589844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.024925576522946358, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.490562438964844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.038796987384557724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.459455490112305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14818324148654938, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.515501976013184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09483892470598221, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.0281925201416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004236053209751844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.731428146362305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12587976455688477, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.218998908996582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08365761488676071, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.806432723999023, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011215273290872574, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 193.9045867919922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0709029883146286} -{"step": 671088640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 98.75276184082031, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16890771687030792, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.207304000854492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006796128582209349, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.375855922698975, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0322612039744854, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.324141979217529, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04868883639574051, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.646538257598877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22400739789009094, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.632169723510742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2319660633802414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.939297676086426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004693556576967239, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.353425979614258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15652212500572205, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.108977317810059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12168245762586594, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.0876522064209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005786409601569176, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.844892501831055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02900831773877144, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.5444793701171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05065084993839264, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4602580070495605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19486667215824127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.517537593841553, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1036510244011879, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.032089233398438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.005529586225748062, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.756386756896973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.14802534878253937, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.231781005859375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.10370408743619919, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 19.917783737182617, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008851547725498676, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 195.72335815429688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06480858474969864} -{"step": 692060160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.14350891113281, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18792445957660675, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.20665168762207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0059836856089532375, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.4056501388549805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0378868393599987, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.347189426422119, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05957099422812462, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.638116359710693, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23122748732566833, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.623458385467529, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23385579884052277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.93490982055664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004877936094999313, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.360335350036621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16353163123130798, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.1118745803833, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12102138996124268, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.101869583129883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0050003728829324245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 6.92793083190918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019052548334002495, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.599695205688477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03249192610383034, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.460206985473633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16674306988716125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.51927375793457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09624015539884567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.034385681152344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004022499546408653, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.776030540466309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11830127984285355, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.242484092712402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08380503207445145, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.027042388916016, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011011815629899502, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 197.4512481689453, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.057402629405260086} -{"step": 713031680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.51747131347656, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1992204785346985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.206153869628906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0073252529837191105, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.434369087219238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04047240689396858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.369598388671875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05483345314860344, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.630821228027344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2436896413564682, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.615761756896973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24364270269870758, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.929914474487305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005211376119405031, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.366229057312012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16412408649921417, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.114655494689941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12700264155864716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.115434646606445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004927243571728468, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.009395599365234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.025404080748558044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.653809070587158, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.040227822959423065, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.460838317871094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16747574508190155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.521536350250244, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10214471071958542, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.036109924316406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004297580104321241, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.794177055358887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1131877675652504, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.25202751159668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07444637268781662, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.133089065551758, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010427573695778847, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.0926513671875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06068232282996178} -{"step": 734003200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 99.87421417236328, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14770953357219696, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.20499610900879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005312946625053883, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.462728500366211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03495730087161064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.391595840454102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04015907272696495, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.6229448318481445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19105082750320435, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.607223033905029, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19038763642311096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.925215721130371, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003656572662293911, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.371471405029297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1333768367767334, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.116661071777344, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10832662135362625, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.129770278930664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0039011212065815926, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.09256649017334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01882564090192318, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.708517074584961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03131445124745369, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.461476802825928, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.13706405460834503, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.523157119750977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08868294954299927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.037691116333008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003393124556168914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.811773300170898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10641682893037796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.261728286743164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07753830403089523, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.237979888916016, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008402517065405846, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 200.65077209472656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04768432304263115} -{"step": 754974720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.21381378173828, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1598212867975235, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.203920364379883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006124094594269991, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.49249792098999, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.033307209610939026, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.41510534286499, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03834507241845131, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.614730358123779, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19618023931980133, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.59840726852417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20103719830513, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.91913890838623, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004209260456264019, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.374846458435059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14273720979690552, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.117693901062012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11185507476329803, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.144805908203125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004436841234564781, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.177601337432861, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02994978055357933, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.764934539794922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04478539526462555, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.461979389190674, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1451651006937027, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.525569915771484, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09219705313444138, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03875160217285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0038482875097543, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.827352523803711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1101423054933548, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.270072937011719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08090092986822128, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.341751098632812, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011534017510712147, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 202.1275634765625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04739595577120781} -{"step": 775946240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.53907775878906, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15041188895702362, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.20145606994629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005664832424372435, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.518926620483398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03157982975244522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.435159683227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03807952627539635, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.606184959411621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.19896109402179718, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.589370250701904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21153582632541656, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.914752960205078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004207780584692955, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.379494667053223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14511820673942566, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.119644165039062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1150858998298645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.160945892333984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004575198050588369, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.262684345245361, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018528541550040245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.821331024169922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030843783169984818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.463128089904785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1604972928762436, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.528563976287842, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09221888333559036, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.038949966430664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004677980672568083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.840797424316406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12646643817424774, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.277005195617676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08775987476110458, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.442516326904297, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010175570845603943, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 203.53268432617188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07237948477268219} -{"step": 796917760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 100.85066986083984, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1554049700498581, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.200231552124023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005884641315788031, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.546838283538818, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0315103679895401, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.4567131996154785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.042503852397203445, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.597506046295166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2025754302740097, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.580153465270996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21013079583644867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.909207344055176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003938061650842428, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.381635665893555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14168724417686462, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.120292663574219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11735670268535614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.177385330200195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004249020479619503, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.351010322570801, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02769741788506508, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.880019664764404, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04642076417803764, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4639482498168945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16216322779655457, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.530672550201416, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10125322639942169, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03976821899414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004411919973790646, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.854723930358887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13190525770187378, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.283645629882812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09845489263534546, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.541362762451172, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011469212360680103, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 204.87496948242188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07506503164768219} -{"step": 817889280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.14683532714844, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.22000934183597565, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.19742774963379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006730210967361927, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.571593761444092, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.041028186678886414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.475602149963379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.051956236362457275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.588323593139648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.23964934051036835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.570454120635986, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.23196707665920258, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.904508590698242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004638558719307184, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.385554313659668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16733814775943756, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.121813774108887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12483595311641693, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.194238662719727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004957914352416992, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.4375691413879395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.025799276307225227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.93694543838501, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04773366078734398, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.46498441696167, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1819290965795517, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.532857894897461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10264655202627182, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03961181640625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0052210669964551926, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.86609935760498, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.14571766555309296, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.28909683227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09984617680311203, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.637245178222656, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008675891906023026, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 206.15325927734375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.060103755444288254} -{"step": 838860800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.4269790649414, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16548685729503632, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.193157196044922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00631866417825222, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.592062950134277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03739091008901596, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.491156101226807, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04775504022836685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.579164505004883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.22641827166080475, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.560720443725586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2360401153564453, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.898601531982422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0047321259044110775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.387088775634766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.159836083650589, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.122413635253906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12747898697853088, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.211490631103516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005227172281593084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.524510860443115, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.023571234196424484, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 6.993638515472412, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03784516826272011, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.46632194519043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1865781992673874, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.535778999328613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10331599414348602, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.040176391601562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004318274091929197, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.877476692199707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12483514845371246, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294609069824219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09337209910154343, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.731260299682617, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.011061922647058964, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 207.3667449951172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05369976535439491} -{"step": 859832320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.69369506835938, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1903057098388672, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.189407348632812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0077988444827497005, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.613979816436768, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03927868977189064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5073933601379395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05319715291261673, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.569584846496582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24764220416545868, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.550307750701904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.28398656845092773, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.893308639526367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007928508333861828, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.388934135437012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.20046375691890717, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.123087882995605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1666947603225708, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.229595184326172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.007952384650707245, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.6141767501831055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.04599399492144585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.051896095275879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.08062897622585297, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.467202186584473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.3016878664493561, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5380048751831055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.142494797706604, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.038970947265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.009163481183350086, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.884516716003418, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.2471342533826828, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.298942565917969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.17914989590644836, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.8252010345459, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.01652098074555397, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 208.52133178710938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09497573226690292} -{"step": 880803840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 101.94709014892578, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14354360103607178, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.184633255004883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00488458713516593, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.6311750411987305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.029127709567546844, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.519965171813965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03381664678454399, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.560546875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1887412965297699, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.540244102478027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20072545111179352, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.888506889343262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004862729460000992, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.391074180603027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1442122906446457, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.124072074890137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11047545075416565, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.248088836669922, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004163730889558792, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.703055381774902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.020309334620833397, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.109245777130127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03782542794942856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4680585861206055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15429525077342987, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.540668964385986, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08723444491624832, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.037872314453125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0038573334459215403, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.892520904541016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10208939760923386, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.302589416503906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07274169474840164, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 20.914033889770508, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010438255034387112, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 209.62319946289062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.052713461220264435} -{"step": 901775360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.18645477294922, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16836577653884888, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.1790771484375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006027868017554283, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.646666526794434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.033254820853471756, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.53118371963501, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04246770590543747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.551477909088135, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2060999870300293, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.530262470245361, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2116403728723526, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.883190155029297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004019757267087698, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.391355514526367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14800037443637848, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.123870849609375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1161179393529892, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.267032623291016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0040825167670845985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.7931365966796875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017604848369956017, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.166900157928467, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030126966536045074, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.468708515167236, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1451670527458191, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.543340682983398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09584832936525345, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03658103942871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0035786242224276066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.898760795593262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11105644702911377, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.305675506591797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08189026266336441, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.002134323120117, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008701051585376263, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 210.66497802734375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04426143690943718} -{"step": 922746880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.4144058227539, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2262229025363922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.174156188964844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006924582179635763, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.666111469268799, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04799668863415718, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.544976234436035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.057134419679641724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.542266368865967, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2837378978729248, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.520455837249756, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.3030804693698883, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.87717056274414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.007448333781212568, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.391164779663086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.21015238761901855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.123257637023926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.16887028515338898, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.286304473876953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0070360759273171425, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.883894443511963, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.03637872263789177, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.22528076171875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.07104922086000443, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4690351486206055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2768845856189728, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.545424938201904, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.13239464163780212, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.0357723236084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.009587615728378296, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.905253410339355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.24251806735992432, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.309744834899902, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1801442950963974, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.08904457092285, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.015330053865909576, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 211.66465759277344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09232188016176224} -{"step": 943718400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.63108825683594, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1274511069059372, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.16970443725586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004951883107423782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.684271812438965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025131331756711006, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.558343410491943, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.029680989682674408, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.533072471618652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18317833542823792, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.510698318481445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19557005167007446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.872660636901855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004015302751213312, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.392345428466797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14174610376358032, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.123438835144043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11145199835300446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.304624557495117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004190186504274607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 7.970953464508057, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019785325974225998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.280458927154541, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03551844134926796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.469304084777832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15011508762836456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.547247886657715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08944645524024963, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.033184051513672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0039869630709290504, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.908336639404297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1184912919998169, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.310449600219727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0827411636710167, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.17138671875, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00847682636231184, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 212.61061096191406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04271220043301582} -{"step": 964689920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 102.83616638183594, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.19310571253299713, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.16387367248535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0075220088474452496, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.698805332183838, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04118522256612778, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.568568706512451, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.050233397632837296, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.523387908935547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.27859291434288025, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.500224590301514, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.28574270009994507, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.86756706237793, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0056863510981202126, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.392171859741211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.20576730370521545, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.122933387756348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.15938805043697357, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.323240280151367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005858907476067543, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.059067726135254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.030604302883148193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.337477684020996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.06020323187112808, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.469053745269775, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.2537599802017212, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.548333644866943, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12329892814159393, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.03171157836914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.006598007399588823, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.91299057006836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.17954617738723755, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.312603950500488, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.13246680796146393, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.253263473510742, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008732150308787823, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 213.51876831054688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08135558664798737} -{"step": 985661440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.02790832519531, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.18502932786941528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.15755844116211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006532495841383934, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.711118221282959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03459898754954338, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.577378273010254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04459018632769585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.513943672180176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24198126792907715, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.489967346191406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2540217638015747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.862030982971191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005323528777807951, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.390410423278809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.18112412095069885, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.121883392333984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13146111369132996, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.342395782470703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005483789369463921, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.148442268371582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019464435055851936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.395526885986328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03815377503633499, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.469831466674805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.18156155943870544, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5504350662231445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10218934714794159, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.029966354370117, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004849160555750132, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.916570663452148, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1342330276966095, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.314513206481934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09413942694664001, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.33277130126953, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009822580963373184, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 214.37535095214844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.046385060995817184} -{"step": 1006632960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.20852661132812, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13846439123153687, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.150636672973633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005481349769979715, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.722568988800049, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02943485416471958, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5855512619018555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03407224640250206, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.504920959472656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2122947871685028, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.480017185211182, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2303488850593567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.856900215148926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004609906114637852, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.388866424560547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16954997181892395, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.120835304260254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12236542254686356, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.361698150634766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004596853628754616, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.236722946166992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018330125138163567, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.452226638793945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03403463959693909, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.470539093017578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16736820340156555, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.552735328674316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09717044979333878, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.028366088867188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003690483747050166, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.919715881347656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11369588226079941, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.31596851348877, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07203711569309235, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.41002082824707, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009517679922282696, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 215.1880340576172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.045718055218458176} -{"step": 1027604480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.37842559814453, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.17097049951553345, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.14415168762207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005818004719913006, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.733946323394775, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03953796997666359, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.593763828277588, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04540498927235603, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.495234966278076, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20573841035366058, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.469447135925293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.22865533828735352, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.852123260498047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004927672911435366, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.387645721435547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17744459211826324, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.119861602783203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13431575894355774, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.381933212280273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005823800805956125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.328179359436035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02537188120186329, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.510344505310059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05533522740006447, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.471221446990967, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1884235292673111, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.554962635040283, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10737979412078857, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.02585220336914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.006119125057011843, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.92074203491211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.16404257714748383, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.316360473632812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1146012619137764, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.485862731933594, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006470391061156988, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 215.96669006347656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08118434250354767} -{"step": 1048576000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.53921508789062, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15250295400619507, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.137027740478516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0056764353066682816, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.742634296417236, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03211706131696701, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.599850654602051, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04062773287296295, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.485884666442871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.2343633472919464, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.459433555603027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.24489594995975494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.848189353942871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0049665095284581184, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.387279510498047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.17910099029541016, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.119373321533203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13394367694854736, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.400287628173828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005104880779981613, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.412029266357422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.019640835002064705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.5629072189331055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04170290008187294, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.471403121948242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19912932813167572, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.555972576141357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10732351988554001, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.023094177246094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004502784926444292, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.920802116394043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13237512111663818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.316429138183594, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09653942286968231, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.55845832824707, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008256547152996063, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 216.71029663085938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06455971300601959} -{"step": 1069547520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.68973541259766, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13277128338813782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.130191802978516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004817625042051077, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.753049373626709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02970849722623825, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.606976509094238, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03631262108683586, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.476943016052246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1878119856119156, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.449827194213867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21054595708847046, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.842676162719727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003939538262784481, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.38418960571289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14629168808460236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.117608070373535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11295537650585175, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.419662475585938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004433369264006615, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.498003959655762, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02039630152285099, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.6166181564331055, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03607232868671417, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.472062110900879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1549767553806305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.558233261108398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0902276486158371, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.0206298828125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004178436938673258, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.92137336730957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10629186034202576, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.316780090332031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08424381911754608, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.630090713500977, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008926081471145153, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 217.4196014404297, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04491548240184784} -{"step": 1090519040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.82997131347656, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.2123381793498993, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.121179580688477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.007113635074347258, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.760039806365967, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.04484996572136879, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.610993385314941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.0511600635945797, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.467049598693848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.26573285460472107, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.439029693603516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2928977310657501, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.838415145874023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.006002267822623253, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.382834434509277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.22287355363368988, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.116966247558594, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.17258881032466888, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.439586639404297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.007282237056642771, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.586191177368164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.03477633744478226, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.671808242797852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.07583872228860855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.472545623779297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.27810540795326233, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5603532791137695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.1360854059457779, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.01826286315918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.008996556513011456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.922319412231445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.23445704579353333, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.317099571228027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.17156386375427246, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.70013999938965, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006682653445750475, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.09317016601562, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09797023981809616} -{"step": 1111490560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 103.96247100830078, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13868118822574615, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.113779067993164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005250274669378996, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.767143249511719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03012949414551258, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.615289211273193, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.037780750542879105, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.458234786987305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20290571451187134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.429266452789307, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21011364459991455, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.833580017089844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004129602108150721, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.380024909973145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1530473828315735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.115361213684082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11847081780433655, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.45855712890625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004082109779119492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.671676635742188, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016772501170635223, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.724644184112549, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02980772778391838, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.472939491271973, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1493576169013977, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.561786651611328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0973515585064888, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.016138076782227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004542010836303234, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.922600746154785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12575873732566833, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.31704044342041, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0968194380402565, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.76634979248047, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007979333400726318, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 218.73269653320312, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04688430577516556} -{"step": 1132462080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.08636474609375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.120073601603508, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.105073928833008, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004408315755426884, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7717061042785645, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.029728079214692116, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.617607116699219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.034636907279491425, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.448611259460449, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17545853555202484, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.418985366821289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20262065529823303, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.82935905456543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003749098163098097, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.378005027770996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14890924096107483, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.114208221435547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11339367926120758, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.47893714904785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00393409701064229, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.759495735168457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01597997546195984, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.778600692749023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03523553907871246, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.473773002624512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1407652199268341, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.563863754272461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08517894893884659, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.013280868530273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0034993235021829605, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.921398162841797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10676004737615585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.315776824951172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08326132595539093, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.8314266204834, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007974893786013126, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.34686279296875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04672928899526596} -{"step": 1153433600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.20026397705078, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12122420221567154, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.097309112548828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004246823489665985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.780515193939209, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02808750607073307, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.623676300048828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.035294320434331894, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.439157962799072, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16921408474445343, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.408524990081787, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18770410120487213, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.82486343383789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0039095813408494, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.375362396240234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13721004128456116, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.11288833618164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10592159628868103, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.50039291381836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0043614632450044155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.849634170532227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01691398210823536, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.834761142730713, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030306363478302956, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.474414825439453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.13896244764328003, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.565788269042969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08192513138055801, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.01087188720703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003296987619251013, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.920491218566895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08899592608213425, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.315016746520996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0681302472949028, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.895658493041992, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008948689326643944, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 219.93605041503906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.037297800183296204} -{"step": 1174405120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.30624389648438, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.16874071955680847, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.08860206604004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.006537494249641895, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.785733222961426, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03734278306365013, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.626766681671143, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04993724450469017, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.429654598236084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.24585014581680298, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.398253440856934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.25182148814201355, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.82066535949707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005614143796265125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.373138427734375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.18812111020088196, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.111496925354004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13958661258220673, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.52095603942871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005102524999529123, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 8.935851097106934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.021711409091949463, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.88767671585083, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.042384471744298935, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.47512149810791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.19708973169326782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.56776237487793, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.10619157552719116, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.00830078125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004781789146363735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.919118881225586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.13362550735473633, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.313972473144531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09846112132072449, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 21.95686149597168, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007729543838649988, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 220.49261474609375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.056331515312194824} -{"step": 1195376640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.40386199951172, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1443091481924057, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.080127716064453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005400918889790773, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.790866374969482, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03997375816106796, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.629341125488281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.043772466480731964, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.4206366539001465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20094364881515503, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.38850212097168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21942643821239471, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.81614875793457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004109491128474474, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.369528770446777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15732170641422272, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.109539985656738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11951326578855515, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.541399002075195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00489388732239604, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.021750450134277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016986126080155373, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.940754413604736, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03242585435509682, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.475836753845215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15780499577522278, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.569888591766357, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09611479938030243, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.006059646606445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0035207041073590517, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.918004035949707, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10379374027252197, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.313362121582031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0799737498164177, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.01519775390625, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008530901744961739, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.02005004882812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04206881672143936} -{"step": 1216348160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.49425506591797, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12834088504314423, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.070816040039062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004602066706866026, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.792155742645264, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03019305318593979, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.629029750823975, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.037903059273958206, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.411299228668213, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1846567541360855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.378029823303223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19808973371982574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.81168270111084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004035722464323044, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.365976333618164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14356204867362976, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.107963562011719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11312804371118546, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.562915802001953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004151518922299147, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.110876083374023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017268314957618713, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 7.995306968688965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03543175756931305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.476946830749512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16098855435848236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.572293758392334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08949856460094452, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.003738403320312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0037671553436666727, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.916715621948242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10819061845541, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.31284236907959, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0876147672533989, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.07240867614746, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009745069779455662, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 221.52281188964844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05137338489294052} -{"step": 1237319680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.57857513427734, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11640333384275436, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.061397552490234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004486315883696079, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.791942119598389, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.029062999412417412, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.627524375915527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03526671230792999, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.4020915031433105, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17023293673992157, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.367763042449951, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20399853587150574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.807469367980957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0045611136592924595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.362671852111816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16185320913791656, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.106319427490234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12049593031406403, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.58426856994629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004548208322376013, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.197653770446777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018378842622041702, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.048360824584961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03713280335068703, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4779181480407715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.15868349373340607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.574751377105713, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08918557316064835, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 16.00111198425293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0037140557542443275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.914753913879395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1148362010717392, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.311934471130371, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0828116312623024, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.127599716186523, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0071547022089362144, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.00331115722656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05381985008716583} -{"step": 1258291200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.65559387207031, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.15479178726673126, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.051982879638672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0056271981447935104, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.794132232666016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03376675397157669, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.628163814544678, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03824072331190109, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.392588138580322, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.239504873752594, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.357595443725586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2761199176311493, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.804226875305176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.005081983748823404, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.360411643981934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.2165038287639618, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.105204582214355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.16231860220432281, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.6048641204834, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.006050730124115944, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.280817031860352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.024710409343242645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.099274635314941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.05710914358496666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4782233238220215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.24823340773582458, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.576200485229492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.12348252534866333, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.99890422821045, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.007837141864001751, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.913162231445312, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.19733473658561707, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.311267852783203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.1502806693315506, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.180641174316406, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.010703428648412228, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.4578094482422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07530856877565384} -{"step": 1279262720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.72763061523438, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1229504942893982, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.042465209960938, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.005165172275155783, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.792737007141113, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02891668491065502, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6258063316345215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03314577043056488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.3840107917785645, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1909920573234558, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.348453521728516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2152715027332306, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.8007230758667, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003918906208127737, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.357582092285156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1624007523059845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.103498458862305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12473740428686142, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.624242782592773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004190982319414616, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.358009338378906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016766544431447983, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.145902633666992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030247613787651062, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4792561531066895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17007869482040405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.578405857086182, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09653685241937637, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.995818138122559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0030717148911207914, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.910046577453613, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.09919384866952896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.309524536132812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0772256925702095, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.23041343688965, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007812993600964546, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 222.88912963867188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.040589094161987305} -{"step": 1300234240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.79364013671875, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.14237770438194275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.033334732055664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004999454598873854, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.794217109680176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03952721878886223, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.625731945037842, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05207112058997154, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.375442981719971, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.20706315338611603, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.339385032653809, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.2326260656118393, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.796833992004395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004903561435639858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.353879928588867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1757020354270935, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.101561546325684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.13490460813045502, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.644407272338867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004986016545444727, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.437183380126953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.02168312855064869, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.194069862365723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0395100899040699, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.480190753936768, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17677593231201172, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.580641269683838, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0951966792345047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.99330997467041, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003822125494480133, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.907413482666016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11359522491693497, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.307971000671387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08638717234134674, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.278980255126953, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.009429089725017548, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 223.30198669433594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.039970383048057556} -{"step": 1321205760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.85383605957031, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11980850249528885, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.02395248413086, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0041864486411213875, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.794025421142578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027717268094420433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6245598793029785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03190067037940025, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.3667097091674805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17189499735832214, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.330265045166016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20685476064682007, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.793643951416016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004100379534065723, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.351226806640625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16153450310230255, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.10039234161377, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12306202948093414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.664167404174805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004616128746420145, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.515366554260254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.018149929121136665, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.241915702819824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.04054482653737068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.480687618255615, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16705505549907684, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.582277774810791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0884254202246666, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.990947723388672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0031870808452367783, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.904937744140625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0969775840640068, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.306539535522461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07198255509138107, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.325700759887695, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008748446591198444, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 223.6920928955078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0450456365942955} -{"step": 1342177280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.9077377319336, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12851585447788239, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.01484489440918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004661716986447573, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.79237699508667, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03169342875480652, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.622257709503174, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03802288696169853, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.358614921569824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.18418559432029724, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.321799278259277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20726437866687775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.790168762207031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004197420086711645, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.347872734069824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1618937999010086, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.098784446716309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12697763741016388, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.68320083618164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0044013261795043945, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.591217041015625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.016468828544020653, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.287948608398438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03310227766633034, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.481343746185303, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1623481661081314, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.584101676940918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09347938746213913, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.988179206848145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0034869934897869825, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.9019193649292, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10626473277807236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.305159568786621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08674795180559158, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.37054443359375, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008975155651569366, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.0583953857422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.038784127682447433} -{"step": 1363148800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 104.95699310302734, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.13106197118759155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 16.00577735900879, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00441513629630208, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.790521621704102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.041328929364681244, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.619964122772217, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.05257504805922508, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.350550651550293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17535753548145294, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.313118934631348, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21485452353954315, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.786994934082031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004263914655894041, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.344681739807129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16331933438777924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.097189903259277, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12461083382368088, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.70192527770996, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0050458782352507114, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.66519832611084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01969754323363304, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.333105087280273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.040500104427337646, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.481988430023193, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16179925203323364, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.585750102996826, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08745510876178741, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.985800743103027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0032901347149163485, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.899432182312012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.10416936129331589, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.3040189743042, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08049898594617844, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.413549423217773, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006989380810409784, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.40870666503906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04113194718956947} -{"step": 1384120320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.00118255615234, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.1297096610069275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.996825218200684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004796045366674662, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.790221214294434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03816244751214981, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.618624210357666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.050038471817970276, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.342316150665283, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17795740067958832, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.304605484008789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20956312119960785, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.784363746643066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004438872914761305, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.342273712158203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16345641016960144, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.095829010009766, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.12163975089788437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.719762802124023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0044663213193416595, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.734810829162598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017093630507588387, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.37569808959961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03461218997836113, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4821248054504395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.17206557095050812, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.586771011352539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09131931513547897, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.984023094177246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003346768906340003, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.897669792175293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.1098959669470787, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.303421020507812, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08131074160337448, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.454565048217773, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008410343900322914, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 224.73782348632812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.039824631065130234} -{"step": 1405091840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.04141235351562, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10980510711669922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.988362312316895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004613295663148165, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.788238525390625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02730417810380459, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.616106033325195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03118818625807762, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.334522724151611, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.17463168501853943, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.296504020690918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.21950732171535492, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.781804084777832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0048908391036093235, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.339812278747559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.18363060057163239, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.094611167907715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1302187442779541, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.737199783325195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005056271329522133, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.802739143371582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.017178647220134735, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.41719913482666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03809356689453125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.482367515563965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16844430565834045, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.587932109832764, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.09494619816541672, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.981934547424316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004396616481244564, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.895424842834473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.11584670096635818, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.302506446838379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.08720508217811584, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.493118286132812, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0073248883709311485, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.0458221435547, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04171950742602348} -{"step": 1426063360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.07759094238281, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.12269166111946106, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.980728149414062, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004413495305925608, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.788037300109863, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02931618131697178, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6149702072143555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03542456775903702, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.327334880828857, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1777166873216629, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.289252758026123, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.20713862776756287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.779505729675293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004148393869400024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.337306022644043, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.16806180775165558, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.0934476852417, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.1252432018518448, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.7537899017334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.005202798172831535, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.867013931274414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.015789996832609177, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.45602035522461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.03207092359662056, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.482585430145264, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.16751034557819366, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.588929653167725, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08985328674316406, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.980103492736816, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.004591639619320631, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.893207550048828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.12081374228000641, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.301575660705566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.09551629424095154, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.52956771850586, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006191882770508528, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.33660888671875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.040660589933395386} -{"step": 1447034880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.1097412109375, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10361326485872269, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.972902297973633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0040955012664198875, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.786749839782715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02606782503426075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.613173484802246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.029982388019561768, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.320376396179199, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16084487736225128, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.282157897949219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.18057161569595337, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.777213096618652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0035502146929502487, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.33474063873291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1428181231021881, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.091985702514648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10830087214708328, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.769580841064453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004057480953633785, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.927820205688477, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012033109553158283, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.49258041381836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.024020787328481674, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4827961921691895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.14246435463428497, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.589868068695068, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07584361732006073, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.978815078735352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.003252750961109996, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.891489028930664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08533839136362076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.300905227661133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.07061587274074554, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.563657760620117, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007202944252640009, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.60824584960938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03303724527359009} -{"step": 1468006400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.13800811767578, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09399498999118805, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.965453147888184, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003445738460868597, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.786408424377441, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025462545454502106, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.612412452697754, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.028709962964057922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.313694953918457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1372944563627243, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.275253772735596, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16491544246673584, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.775015830993652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003304349724203348, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.332047462463379, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13308030366897583, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.090620040893555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10034752637147903, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.785429000854492, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0035926545970141888, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 9.987496376037598, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012206470593810081, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.528753280639648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02233700081706047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483254909515381, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.11404824256896973, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.591067790985107, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07005259394645691, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.977273941040039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002467783633619547, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.889127731323242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07679794728755951, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.299739837646484, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0575130395591259, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.596132278442383, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008152167312800884, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 225.86175537109375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03280133754014969} -{"step": 1488977920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.16342163085938, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10005809366703033, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.958407402038574, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00367787410505116, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.786548614501953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026908202096819878, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.61171293258667, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.030603963881731033, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.306991100311279, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.14510351419448853, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.268499374389648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17864440381526947, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.773299217224121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00395252276211977, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.329984664916992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14322058856487274, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.089874267578125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10676195472478867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.800697326660156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0040080430917441845, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.044715881347656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013902026228606701, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.563701629638672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.02896808460354805, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483264446258545, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12795588374137878, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.591699600219727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0748615637421608, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.975811958312988, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0023791883140802383, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.887045860290527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07634780555963516, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29893970489502, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.056481100618839264, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.62710189819336, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007677266839891672, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.10121154785156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.031771473586559296} -{"step": 1509949440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.18539428710938, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10333291441202164, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.951111793518066, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.004084891639649868, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.786172389984131, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026624659076333046, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.610756874084473, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03195900097489357, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.300264835357666, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.15188072621822357, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.261721134185791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1826605349779129, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.771712303161621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004008061718195677, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.328073501586914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.160188689827919, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.089116096496582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11520666629076004, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.815811157226562, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004676456563174725, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.101058006286621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013171174563467503, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.59827709197998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.029553227126598358, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483395099639893, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.127363383769989, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.592432022094727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07496315240859985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.974534034729004, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0029362391214817762, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.88536548614502, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07757341116666794, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.298563003540039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.060260891914367676, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.656368255615234, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008316384628415108, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.32655334472656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.035653628408908844} -{"step": 1530920960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.20462799072266, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.11396458745002747, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.943620681762695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003947051241993904, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.783239364624023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.03071695566177368, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6075439453125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.04031161218881607, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.29368257522583, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.16183847188949585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.255157470703125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.19286823272705078, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.770133018493652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003958765882998705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.325998306274414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15726765990257263, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.088160514831543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11602576076984406, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.83060073852539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.00452496437355876, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.155277252197266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.015619331039488316, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.631708145141602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.032064978033304214, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.483851432800293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1417999118566513, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.593683242797852, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.08252529054880142, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.973238945007324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002587195485830307, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.883520126342773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08338892459869385, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.297889709472656, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05904332920908928, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.68381118774414, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006993582006543875, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.5357208251953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.033724334090948105} -{"step": 1551892480, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.22181701660156, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09962327778339386, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.937012672424316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.00353817967697978, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7823662757873535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.029137583449482918, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.606332302093506, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03432812914252281, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.287662982940674, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.144985631108284, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.249124526977539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17256025969982147, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.768806457519531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.00383767276071012, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.324345588684082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.14707161486148834, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.087503433227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10862956196069717, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.844385147094727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004208668135106564, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.205770492553711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.014363979920744896, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.662903785705566, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.028810758143663406, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484007358551025, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12444452941417694, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.594402313232422, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07613570988178253, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.971917152404785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0024110598023980856, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.881599426269531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0721137598156929, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.297268867492676, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.05547235906124115, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.70953369140625, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007785211782902479, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.72984313964844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.032534707337617874} -{"step": 1572864000, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.23641204833984, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.10109857469797134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.930439949035645, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0032284697517752647, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.780671119689941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026077333837747574, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.604480743408203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.029606075957417488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2819108963012695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13139843940734863, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.2434611320495605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16558316349983215, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.767180442810059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0035710991360247135, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.32218074798584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1398778110742569, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.086627960205078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.10639715939760208, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.8573055267334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004671173170208931, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.252954483032227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013983513228595257, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.691996574401855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.030220499262213707, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484241962432861, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12299881130456924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.595282554626465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07502612471580505, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.970890045166016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0027419247198849916, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.8800630569458, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08290151506662369, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.296677589416504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06491836160421371, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.73354721069336, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008410969749093056, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 226.91127014160156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03750960901379585} -{"step": 1593835520, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.24905395507812, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09243618696928024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.9247407913208, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003402952803298831, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.780597686767578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027866259217262268, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.603864669799805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03243987262248993, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.276237964630127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13709188997745514, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.237888336181641, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.17411547899246216, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.765706062316895, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.004149308428168297, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.32021713256836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.15963229537010193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.085789680480957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11466271430253983, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.869632720947266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004589144140481949, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.297027587890625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012768845073878765, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.719462394714355, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.025856906548142433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484678745269775, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.12170150876045227, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.596350193023682, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07552174478769302, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.969552040100098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0023309034295380116, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.87802505493164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.08125391602516174, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.295970916748047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.06127782538533211, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.75592803955078, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007425856776535511, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.07814025878906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03199837729334831} -{"step": 1614807040, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.25968933105469, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08720488846302032, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.919342994689941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0030853801872581244, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.780342102050781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0236943569034338, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.60314416885376, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.026334479451179504, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.271101474761963, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12078545242547989, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.2330098152160645, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14991462230682373, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.764904022216797, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003166456473991275, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.319215774536133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12886184453964233, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.085479736328125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09699972718954086, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.880807876586914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038880454376339912, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.33747673034668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012098388746380806, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.744356155395508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023007355630397797, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484589576721191, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.10679037123918533, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.596654891967773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06676340848207474, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.968751907348633, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0031781469006091356, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.876687049865723, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07571518421173096, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.295546531677246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.059668585658073425, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.7764892578125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.008376345969736576, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.23191833496094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.032485444098711014} -{"step": 1635778560, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.26842498779297, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08823104947805405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.914525032043457, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.003232541959732771, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7802839279174805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024221789091825485, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.6026482582092285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.026398301124572754, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2664971351623535, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.13261380791664124, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.228523254394531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1760001927614212, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.76407527923584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.003781400853767991, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.318000793457031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.1536746323108673, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.084892272949219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.11282370239496231, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.891481399536133, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004331895615905523, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.375948905944824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.013307003304362297, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.768440246582031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.028971334919333458, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4844255447387695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.13425114750862122, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.596914291381836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.07324984669685364, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.967839241027832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0022403819020837545, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.875207901000977, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.07970745861530304, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.295034408569336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0615096241235733, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.795413970947266, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006873424630612135, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.3727264404297, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.030192572623491287} -{"step": 1656750080, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.2761001586914, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.09502866864204407, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.909595489501953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0034009788651019335, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.778919219970703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.027220258489251137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.601093769073486, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.035552337765693665, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.262114524841309, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12245047092437744, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.224283218383789, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.15305683016777039, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.76317024230957, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0037068473175168037, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.316650390625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12695267796516418, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.084307670593262, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09402047097682953, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.90140724182129, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0039939554408192635, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.41135311126709, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012534528970718384, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.790532112121582, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.023742742836475372, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484342098236084, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0997932106256485, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.597132682800293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06563245505094528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.967400550842285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0018542162142693996, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.874377250671387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06695565581321716, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294927597045898, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.052912887185811996, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.81277084350586, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007678172085434198, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.50030517578125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02787158079445362} -{"step": 1677721600, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.28219604492188, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08639080822467804, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.904654502868652, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0031288324389606714, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.777064323425293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024353275075554848, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.599221229553223, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.026602143421769142, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.257712364196777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.1234888881444931, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.220038414001465, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.16183429956436157, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.762289047241211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0033501870930194855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.315361022949219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.129813089966774, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.083780288696289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09725587069988251, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.91065216064453, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.004146835301071405, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.44362735748291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011268666945397854, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.810907363891602, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.022103572264313698, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484536647796631, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.1078559160232544, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.597663879394531, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0666109248995781, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.9669189453125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0020018797367811203, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.873504638671875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0698276162147522, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294822692871094, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.052224501967430115, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.8287353515625, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0071424697525799274, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.61651611328125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027346689254045486} -{"step": 1698693120, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.28727722167969, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0806402787566185, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.900714874267578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0025654162745922804, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.776659965515137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023086031898856163, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.598613262176514, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023670680820941925, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.253968715667725, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11650852113962173, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.216404438018799, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.146949902176857, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.761457443237305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0031210542656481266, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.314085006713867, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12148448079824448, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.083221435546875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09380887448787689, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.9190731048584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033815335482358932, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.4733247756958, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010848375037312508, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.829716682434082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020513735711574554, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484611988067627, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0967426598072052, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598107814788818, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06363040208816528, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966476440429688, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019818495493382215, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.872648239135742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.061944495886564255, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29473876953125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04896708205342293, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.843271255493164, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0075004445388913155, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.72265625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027229174971580505} -{"step": 1719664640, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29138946533203, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08171241730451584, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.896827697753906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0024689394049346447, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.7764363288879395, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023841768503189087, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.598209381103516, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.025693215429782867, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.250263690948486, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.10577607154846191, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.212866306304932, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1358894407749176, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.760834693908691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0028106146492064, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.313044548034668, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11399710178375244, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.08279800415039, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08539239317178726, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.926847457885742, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033836732618510723, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.500690460205078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010694639757275581, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.847094535827637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.0225975438952446, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484718322753906, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09441644698381424, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5985236167907715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0609835721552372, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.966052055358887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0020885756239295006, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.871784210205078, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06621462851762772, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29456615447998, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.052485391497612, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.856416702270508, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007068919483572245, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.81834411621094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.026249578222632408} -{"step": 1740636160, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29458618164062, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08337128907442093, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.893363952636719, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0030440830159932375, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.775969505310059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02294907718896866, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.597598552703857, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02502463012933731, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.247044086456299, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.12047680467367172, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.209742069244385, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1426616907119751, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.760181427001953, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0031611942686140537, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.311924934387207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.12788020074367523, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.082366943359375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.09211674332618713, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.93407440185547, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003773883916437626, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.525712013244629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010401732288300991, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.863070487976074, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020180808380246162, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4848246574401855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09471568465232849, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.598974227905273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06067098677158356, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.965736389160156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002396948169916868, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.871097564697266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.0630057156085968, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294587135314941, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04811820760369301, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.86827278137207, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007781114894896746, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.903564453125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.024671070277690887} -{"step": 1761607680, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29691314697266, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08208870887756348, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.8900146484375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002831206191331148, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.77478551864624, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.025325218215584755, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.59632682800293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.026379575952887535, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.243955135345459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.11255711317062378, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.206785202026367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.14551717042922974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.759732246398926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0031551821157336235, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.311108589172363, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.13018125295639038, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.082088470458984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0951998308300972, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.9406681060791, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0038932878524065018, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.548454284667969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01156802661716938, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.877737998962402, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.021937141194939613, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.484951019287109, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09769377112388611, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599436283111572, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.06007656827569008, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96529483795166, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0020952681079506874, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.870226860046387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06263019889593124, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294377326965332, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04955064132809639, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.878829956054688, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007914953865110874, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 227.97889709472656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.026812614873051643} -{"step": 1782579200, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.29873657226562, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07881519198417664, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.887334823608398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0027724653482437134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.774448394775391, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02477494440972805, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.595843315124512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02573251724243164, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.241291046142578, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09827627241611481, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.204228401184082, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12658487260341644, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.759180068969727, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0028015587013214827, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.31023120880127, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10991673171520233, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.081783294677734, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08443328738212585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.94659423828125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003325006226077676, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.568841934204102, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011064225807785988, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.89098072052002, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020735910162329674, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485082626342773, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08796240389347076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.5998148918151855, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.059072863310575485, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.964948654174805, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0017096219817176461, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.869521141052246, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05798131972551346, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29418659210205, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0449628010392189, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.888164520263672, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.0077254436910152435, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.0458984375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027969643473625183} -{"step": 1803550720, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30027770996094, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.079831562936306, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.884708404541016, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0025981180369853973, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.774041652679443, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02180561237037182, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.595283031463623, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02416502870619297, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.238840579986572, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09984404593706131, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.201894283294678, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12630116939544678, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.758821487426758, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0026889771688729525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.309537887573242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10891421139240265, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.08153247833252, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08066116273403168, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.951717376708984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003474786877632141, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.586230278015137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009754996746778488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.902344703674316, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.01999577134847641, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485100746154785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0843932181596756, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.599981307983398, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.054735779762268066, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.9647798538208, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0016660642577335238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.869093894958496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05504951998591423, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294187545776367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04239635169506073, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.896364212036133, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007825806736946106, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.1043701171875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.025308022275567055} -{"step": 1824522240, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30128479003906, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07844646275043488, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.882608413696289, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0026228094939142466, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.773681163787842, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.026744084432721138, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.594866752624512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.03141634538769722, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.236767292022705, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09801914542913437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.199934005737305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12562192976474762, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.758569717407227, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002449610736221075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.30899715423584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.10350145399570465, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.081401824951172, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0798870399594307, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.95627212524414, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033941539004445076, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.60139274597168, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.011099044233560562, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.9122953414917, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.019772741943597794, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485153675079346, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0802309662103653, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600198268890381, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05535649508237839, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.964472770690918, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.002027407754212618, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.868491172790527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05559777468442917, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.294057846069336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.043639909476041794, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.903533935546875, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007822557352483273, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.1551055908203, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.025171836838126183} -{"step": 1845493760, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30195617675781, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.08329157531261444, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.880581855773926, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0024380104150623083, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.773349285125732, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02461901679635048, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.59451961517334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02805447205901146, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.234805583953857, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09931506961584091, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.198079586029053, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1289185881614685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.758419036865234, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0026385276578366756, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.308586120605469, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11356060206890106, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.081353187561035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08393847197294235, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.960365295410156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003674102947115898, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.614888191223145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.012194568291306496, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.921170234680176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.027131132781505585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485202312469482, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.09682861715555191, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.60038423538208, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.057143181562423706, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.964098930358887, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019641281105577946, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.867770195007324, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06065535545349121, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.2938232421875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.04861876368522644, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.909698486328125, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006708559580147266, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.19839477539062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.026482965797185898} -{"step": 1866465280, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30243682861328, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07296175509691238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.878894805908203, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0027314622420817614, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.773242473602295, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.022311167791485786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.594429016113281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.025342732667922974, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.233152866363525, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09277227520942688, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.196506023406982, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12198314815759659, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.758167266845703, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002689122688025236, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.308104515075684, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11004810780286789, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.081164360046387, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08054745942354202, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.963964462280273, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003258280921727419, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.626792907714844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.01005256175994873, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.929027557373047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.019647734239697456, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4852776527404785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08607744425535202, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600558280944824, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0568256750702858, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963995933532715, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0019141610246151686, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.867417335510254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.06168951839208603, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293841361999512, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.046800658106803894, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.914939880371094, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.006461048498749733, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.2350311279297, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02459803968667984} -{"step": 1887436800, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30271911621094, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07384952157735825, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.877330780029297, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002608481328934431, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.772926330566406, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02234739437699318, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.594069957733154, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02384331449866295, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.231672286987305, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.09667356312274933, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.195094108581543, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.12397411465644836, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.75809383392334, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0025998232886195183, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.307851791381836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.11167573183774948, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.081101417541504, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.08112464100122452, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.96698760986328, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033949909266084433, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.636687278747559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.010686291381716728, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.935555458068848, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.020982593297958374, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485335826873779, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07717349380254745, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600717544555664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.05528659000992775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963805198669434, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014784826198592782, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.86695384979248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05317794531583786, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29372501373291, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.041856907308101654, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.91931915283203, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007596577983349562, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.26568603515625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.024858491495251656} -{"step": 1908408320, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30282592773438, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07696308940649033, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.876204490661621, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002682511229068041, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.77277135848999, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02091769129037857, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.593891620635986, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.023824229836463928, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.230549335479736, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.10001248121261597, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.194005012512207, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.120940662920475, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757926940917969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0024901172146201134, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.307515144348145, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.0991939976811409, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080981254577637, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07611832767724991, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.969545364379883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003058471716940403, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.645039558410645, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009786237962543964, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.941107749938965, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017135022208094597, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4854278564453125, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.08350792527198792, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.600914001464844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.053241416811943054, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96358871459961, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014979292172938585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.866518020629883, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04975481703877449, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293586730957031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03840229660272598, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.922924041748047, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007274145260453224, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.29107666015625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0231880284845829} -{"step": 1929379840, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30290222167969, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0730096846818924, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.875150680541992, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002502662828192115, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.772543430328369, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023332379758358, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.593636989593506, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.024055063724517822, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2295427322387695, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08669528365135193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.193045139312744, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.11274195462465286, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.75784683227539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0023642699234187603, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.3073091506958, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.0976642295718193, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080925941467285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.07326640188694, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.971576690673828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033341075759381056, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.651652336120605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.00915449671447277, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.945537567138672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017451271414756775, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485434532165527, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06985777616500854, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.60098123550415, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.051441490650177, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963491439819336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0013936541508883238, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.866265296936035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.05110164359211922, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293534278869629, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03971119597554207, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.925817489624023, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007051910739392042, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.3113250732422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02349466271698475} -{"step": 1950351360, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30291748046875, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0707724541425705, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.87427043914795, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002190346596762538, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.772193431854248, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.0234768558293581, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.593288421630859, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02592303417623043, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.228727340698242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08319129049777985, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.192270755767822, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.1026563048362732, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757773399353027, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002132824156433344, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.30713939666748, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09046362340450287, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080870628356934, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06931430846452713, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.973142623901367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0029575189109891653, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.656719207763672, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009044556878507137, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.948944091796875, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017180508002638817, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485435485839844, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06919682770967484, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601022243499756, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.048782531172037125, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963508605957031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0013976956252008677, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.866164207458496, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04900580644607544, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293562889099121, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03655010461807251, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.928070068359375, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00721871480345726, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.3269500732422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02369789592921734} -{"step": 1971322880, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30289459228516, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.06958898156881332, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.873678207397461, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002187333069741726, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.772000312805176, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.021895822137594223, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.593086242675781, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.022924039512872696, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.228142738342285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08733995258808136, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.191717624664307, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10892507433891296, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757698059082031, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0022177849896252155, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.306975364685059, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.08896248787641525, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080814361572266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06873171776533127, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.97435760498047, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033591007813811302, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.660591125488281, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.00916688609868288, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.951560020446777, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.015973804518580437, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485466003417969, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.07340007275342941, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.6010942459106445, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.0511360801756382, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963470458984375, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0014254108536988497, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.866022109985352, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04841788113117218, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.29355525970459, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03680079057812691, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.92976951599121, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00812452845275402, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.33871459960938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.023971406742930412} -{"step": 1992294400, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3028335571289, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07404487580060959, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.873281478881836, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002460689516738057, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.771970748901367, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.024159565567970276, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.593040466308594, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02852361835539341, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2277350425720215, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08591478317975998, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.191327095031738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10586654394865036, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757616996765137, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0021031207870692015, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.306819915771484, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09433078020811081, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080755233764648, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06929861754179001, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.975252151489258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0033655776642262936, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.66341781616211, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009133066982030869, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.953475952148438, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.017037283629179, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485492706298828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.06644731760025024, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.6011528968811035, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.047273315489292145, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963423728942871, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0012106680078431964, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.865898132324219, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04657009616494179, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293543815612793, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03512854501605034, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.930997848510742, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007163562346249819, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.34716796875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.022615300491452217} -{"step": 2013265920, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30279541015625, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.0693480372428894, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.87296199798584, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0025135409086942673, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.771958827972412, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.023321768268942833, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.593018054962158, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02331271581351757, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.227426528930664, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08150359988212585, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.19104528427124, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.10280966013669968, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757586479187012, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0021550373639911413, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.306733131408691, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.09018812328577042, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080731391906738, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.06716576963663101, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.97585678100586, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0029819789342582226, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.665319442749023, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009075620211660862, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.954756736755371, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.016812527552247047, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.485511779785156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.061929646879434586, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601193428039551, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.046941179782152176, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96338176727295, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0011924265418201685, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.865792274475098, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04662300646305084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293522834777832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.0364474393427372, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.931825637817383, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007166496943682432, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.3528289794922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02307884953916073} -{"step": 2034237440, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.3027572631836, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.07097162306308746, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.872770309448242, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.0021962670143693686, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.771945476531982, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.02296428009867668, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.5929975509643555, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02447490207850933, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.227231025695801, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.07926879823207855, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.190866470336914, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.09971334040164948, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757577896118164, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.002055739751085639, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.30670166015625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.08809856325387955, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080726623535156, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.0661829486489296, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.97622299194336, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.003389052813872695, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.666468620300293, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.00938847940415144, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.955534934997559, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.016842009499669075, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4855217933654785, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.0634622722864151, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601213455200195, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.04676061496138573, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.96337604522705, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0011218866566196084, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.86575698852539, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04516759142279625, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293522834777832, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.03393916040658951, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.932327270507812, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.00761875044554472, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.35630798339844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0230958703905344} -{"step": 2055208960, "pnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 105.30274200439453, "gnorm/_forward_module._fsdp_wrapped_module.model.embeddings.weight": 0.06862739473581314, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 15.872673988342285, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight": 0.002293655648827553, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 7.771938323974609, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.q_proj.weight": 0.022523390129208565, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 7.592988967895508, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.k_proj.weight": 0.02365426905453205, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 5.2271318435668945, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.v_proj.weight": 0.08208690583705902, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 5.190774440765381, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.attn.o_proj.weight": 0.09818429499864578, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 15.757572174072266, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight": 0.0021320278756320477, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 14.306681632995605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.gate_proj.weight": 0.0846630409359932, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 10.080720901489258, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.0.mlp.down_proj.weight": 0.063796266913414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 16.97641944885254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight": 0.0029880490619689226, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 10.66707992553711, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.q_proj.weight": 0.009057550691068172, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 8.955951690673828, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.k_proj.weight": 0.015955671668052673, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 5.4855265617370605, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.v_proj.weight": 0.060953784734010696, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 5.601222515106201, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.attn.o_proj.weight": 0.045592255890369415, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 15.963364601135254, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight": 0.0012846898753196, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 14.86572265625, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.gate_proj.weight": 0.04576730728149414, "pnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 10.293514251708984, "gnorm/_forward_module._fsdp_wrapped_module.model.layers.1.mlp.down_proj.weight": 0.035104937851428986, "pnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 22.93259048461914, "gnorm/_forward_module._fsdp_wrapped_module.model.norm.weight": 0.007696523796766996, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 228.35809326171875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.023200049996376038} +{"step": 20971520, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 71.94264221191406, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.026252994313836098, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.99294376373291, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.000844384718220681, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 5.152261257171631, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.001274436479434371, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 5.131396293640137, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.001312967506237328, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.151031494140625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.022270189598202705, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.159876823425293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.03446931391954422, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.976268768310547, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0008497891249135137, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 12.593941688537598, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.03530651330947876, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 8.91287899017334, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.03939371183514595, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.005695343017578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.001667231903411448, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.151484966278076, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.00022139660723041743, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.176032066345215, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.00024207167734857649, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.1712236404418945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06859473139047623, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.146867752075195, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.15639540553092957, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.00566291809082, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0023993421345949173, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 12.625571250915527, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05496597662568092, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 8.904928207397461, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0710785910487175, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.013927459716797, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0619436614215374, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 72.25462341308594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.9765952229499817} +{"step": 41943040, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 72.86009979248047, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.02286859229207039, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 15.99458122253418, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0012653119629248977, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 5.336243629455566, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.003090121317654848, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 5.317358016967773, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.001993304118514061, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.29594087600708, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.02028149552643299, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.295361518859863, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.02425643429160118, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.943232536315918, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.000994705711491406, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 12.881174087524414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.02802559733390808, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.12070369720459, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.024117430672049522, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.02458381652832, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0012513567926362157, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.326793193817139, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.001766734174452722, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.346847057342529, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0018537738360464573, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.330268859863281, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.019830306991934776, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.303836345672607, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.03463108092546463, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.982643127441406, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0006061396561563015, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 12.898591995239258, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.012880262918770313, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.104888916015625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.021526290103793144, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.14301300048828, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.06305234134197235, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 79.9612045288086, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.967953622341156} +{"step": 62914560, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 73.6592788696289, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1281065046787262, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.013975143432617, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005208871327340603, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 5.55205774307251, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01127499621361494, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 5.552624225616455, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.00852775014936924, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.411322593688965, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09958605468273163, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.401479244232178, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1251579076051712, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.942328453063965, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005625483579933643, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.083107948303223, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1706646978855133, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.269288063049316, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13769246637821198, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.028589248657227, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0064453925006091595, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.424583435058594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010032320395112038, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.446646213531494, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.007508859038352966, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.4212751388549805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08572456985712051, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.399351596832275, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06853678077459335, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.953314781188965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0011765542440116405, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.063981056213379, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04949614778161049, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.240785598754883, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.054528750479221344, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.280672073364258, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.02661321498453617, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 88.78545379638672, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.23508398234844208} +{"step": 83886080, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 74.52867889404297, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.2079300880432129, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.04021644592285, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.01141930278390646, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 5.694957256317139, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0410582460463047, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 5.7021484375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02972049079835415, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.505371570587158, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.20298928022384644, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.486374855041504, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.20884926617145538, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.954619407653809, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0101087037473917, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.19756031036377, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.22487764060497284, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.347999572753906, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.23038744926452637, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.019371032714844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.009282797574996948, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.460790634155273, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01896652765572071, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.484436511993408, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.024420391768217087, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.4511027336120605, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.16232074797153473, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.438854694366455, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.1259874701499939, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.927148818969727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003971653990447521, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.1610746383667, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.10839097946882248, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.307424545288086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07944659888744354, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.373239517211914, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.015033893287181854, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 95.11341094970703, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1771908551454544} +{"step": 104857600, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 75.56535339355469, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.07086353003978729, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.056957244873047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003309188410639763, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 5.786341190338135, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01372932456433773, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 5.788940906524658, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.011736043728888035, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.57407283782959, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.06360014528036118, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.552472114562988, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0649266242980957, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.95974063873291, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0032917382195591927, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.273491859436035, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06526848673820496, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.408791542053223, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07251127064228058, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.014522552490234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0031016990542411804, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.485093593597412, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.004303562454879284, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.506490230560303, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.004964713007211685, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.473215103149414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05916411057114601, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.468750476837158, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04880174994468689, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.907085418701172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003544126870110631, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.230680465698242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06637071073055267, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.356931686401367, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03799905255436897, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.461294174194336, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.015202079899609089, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 99.81867980957031, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09436090290546417} +{"step": 125829120, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 76.71443176269531, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.05265946313738823, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.065593719482422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002253840444609523, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 5.843740940093994, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.011800335720181465, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 5.830393314361572, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.011329933069646358, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.63254976272583, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.039785344153642654, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.609843730926514, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.03926374763250351, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.964338302612305, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0015728551661595702, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.339897155761719, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.033024873584508896, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.470298767089844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0419534407556057, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.007999420166016, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.001593048800714314, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.5041913986206055, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.002971832873299718, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.521057605743408, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.004481981508433819, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.481650352478027, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.030801746994256973, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.485519886016846, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.02908124029636383, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.889280319213867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0021285591647028923, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.27687931060791, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.03923392668366432, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.391510009765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.022341374307870865, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.5576229095459, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.019971314817667007, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 103.81121826171875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09770788997411728} +{"step": 146800640, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 77.87129974365234, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0832742303609848, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.079103469848633, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005305827595293522, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 5.931192874908447, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01690182462334633, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 5.917255878448486, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.016395438462495804, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.682344913482666, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10778306424617767, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.6604485511779785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11822457611560822, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.968682289123535, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005742942914366722, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.411955833435059, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09581631422042847, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.536697387695312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11996420472860336, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.002267837524414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004804317373782396, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.543997287750244, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008212936110794544, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.555352687835693, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.00990273617208004, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.480627536773682, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08972416818141937, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.489901542663574, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06809289753437042, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.880762100219727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0053245509043335915, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.328607559204102, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.11417102068662643, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.426811218261719, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.059333767741918564, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.660398483276367, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.011458941735327244, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 107.48080444335938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09358229488134384} +{"step": 167772160, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 78.9898910522461, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.07372169196605682, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.093412399291992, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0036807761061936617, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.018288612365723, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.016879282891750336, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.009598255157471, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.016880659386515617, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.725409984588623, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08929280191659927, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.704209327697754, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08960553258657455, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.969983100891113, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003189299488440156, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.48346996307373, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06689995527267456, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.599692344665527, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07602671533823013, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.00018882751465, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002583862980827689, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.601019859313965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.005003647413104773, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.605584144592285, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.005323782097548246, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.481170177459717, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.04935716465115547, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.49413537979126, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04710156098008156, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.876036643981934, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0036445746663957834, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.384061813354492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06493222713470459, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.45746898651123, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03675189986824989, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.76885986328125, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.017624806612730026, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 111.0261001586914, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07810555398464203} +{"step": 188743680, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 80.06210327148438, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09626239538192749, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.110111236572266, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005547084845602512, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.115725517272949, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02131461538374424, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.114412307739258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02216772735118866, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.765718936920166, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1256685107946396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.742462635040283, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13927876949310303, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.969482421875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.007090980652719736, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.555663108825684, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11413164436817169, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.657185554504395, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1382267326116562, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.000171661376953, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005172365345060825, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.661643028259277, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015529667027294636, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.656772136688232, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02073517255485058, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.487007141113281, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1098359078168869, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.502994537353516, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09017720818519592, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.871332168579102, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.008534878492355347, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.436957359313965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1717492789030075, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.484776496887207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.09479880332946777, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 16.88384437561035, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.029195519164204597, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 114.63328552246094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.1428980976343155} +{"step": 209715200, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 81.08123779296875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11260921508073807, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.124292373657227, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.007177755702286959, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.20228385925293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.029188372194767, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.200012683868408, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.032979946583509445, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.802692413330078, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1582549512386322, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.775951862335205, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15963755548000336, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.969592094421387, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004124745726585388, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.632543563842773, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11066038906574249, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.709824562072754, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1014782264828682, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.999486923217773, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003155088983476162, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.719771862030029, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011284676380455494, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.700268268585205, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01360571850091219, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.4927568435668945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.0822887197136879, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.51140832901001, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07024737447500229, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.868313789367676, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002452629618346691, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.497396469116211, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.059435371309518814, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.510522842407227, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03922383487224579, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.00267219543457, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01741049624979496, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 118.29827880859375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06727750599384308} +{"step": 230686720, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 82.05226135253906, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08196552097797394, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.135709762573242, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004766589961946011, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.276165962219238, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.018430164083838463, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.270604610443115, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02021835371851921, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.831652641296387, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10870243608951569, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.801970958709717, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11395183205604553, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.9718017578125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0037477186415344477, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.718586921691895, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07895646244287491, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.761872291564941, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08879581838846207, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 15.997980117797852, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0036583298351615667, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.78893518447876, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.0083698108792305, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.749934196472168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.010758865624666214, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.4944868087768555, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08025974035263062, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.515148639678955, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06901020556688309, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.869325637817383, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.006051934789866209, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.568707466125488, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1335991621017456, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.548450469970703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07626014202833176, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.12571144104004, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.02287212759256363, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 122.0132064819336, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.10944526642560959} +{"step": 251658240, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 82.97171020507812, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0985693484544754, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.14668846130371, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006408585701137781, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.351618766784668, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023454520851373672, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.341059684753418, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.029806658625602722, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.8550896644592285, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1465402990579605, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.8245439529418945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1547773778438568, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.973357200622559, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004213822539895773, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.801658630371094, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10698152333498001, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.81142807006836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11242274940013885, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.000080108642578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0036680917255580425, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.8680500984191895, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.020172344520688057, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.809872150421143, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023474035784602165, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.500040054321289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09736565500497818, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.523848533630371, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07508581876754761, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.870423316955566, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0033504057209938765, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.640886306762695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08631057292222977, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.58256721496582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.054056599736213684, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.249889373779297, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.016300659626722336, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 125.68119812011719, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07031093537807465} +{"step": 272629760, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 83.83941650390625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.13572344183921814, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.157978057861328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008853629231452942, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.421381950378418, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.025652090087532997, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.406678199768066, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03090583346784115, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.874969482421875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.21667571365833282, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.843686580657959, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.2100018560886383, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.973121643066406, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0054975831881165504, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.874876022338867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.14572887122631073, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.8522367477417, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.14983515441417694, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.00257110595703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005637784022837877, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 5.940582275390625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.018336256965994835, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.868223190307617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02186456322669983, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.506536960601807, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13483382761478424, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.532861709594727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09268873184919357, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.874214172363281, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028415839187800884, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.716947555541992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07990167289972305, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.618133544921875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06095375120639801, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.371002197265625, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.018891258165240288, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 129.2594451904297, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09407474845647812} +{"step": 293601280, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 84.65815734863281, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.12047797441482544, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.16764259338379, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004903603345155716, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.4844536781311035, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.035203173756599426, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.466188430786133, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.043017417192459106, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.8902130126953125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11427910625934601, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.8582940101623535, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.11595597118139267, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.973136901855469, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0030730587895959616, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 13.945425987243652, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08059222996234894, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.890382766723633, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08622752130031586, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.006895065307617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003237910568714142, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.013665676116943, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01089717261493206, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.927155494689941, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.012780270539224148, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.5149383544921875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08076430857181549, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.5441718101501465, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0690728947520256, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.87830638885498, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004229381214827299, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.795119285583496, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1009538471698761, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.652932167053223, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06006650999188423, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.492191314697266, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01322970725595951, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 132.71885681152344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07095605880022049} +{"step": 314572800, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 85.43990325927734, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.3178398609161377, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.176244735717773, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.010719445534050465, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.546968460083008, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0851193219423294, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.525528430938721, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.10943160951137543, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.901219844818115, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.23237742483615875, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.869448661804199, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.21024435758590698, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.973027229309082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.007672975305467844, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.012927055358887, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.14031417667865753, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.926095008850098, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1255349963903427, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.011709213256836, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005164590664207935, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.089405059814453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.020535128191113472, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 5.987667083740234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.022316325455904007, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.52258825302124, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12407353520393372, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.554883003234863, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0866045206785202, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.884645462036133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004419934470206499, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.878551483154297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.10786059498786926, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.691774368286133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07099142670631409, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.617366790771484, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0171599630266428, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 136.09364318847656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07097050547599792} +{"step": 335544320, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 86.17813873291016, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11931154876947403, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.18680763244629, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006120378151535988, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.610647678375244, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.029953865334391594, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.584476947784424, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.036106377840042114, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.913789749145508, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14671553671360016, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.882905960083008, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14643187820911407, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.97572135925293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0034250100143253803, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.078866004943848, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10068259388208389, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.959540367126465, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09524336457252502, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.015260696411133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0035857446491718292, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.156001567840576, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01914539560675621, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.039898872375488, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02316977083683014, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.528972148895264, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09654287993907928, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.562602519989014, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07718504220247269, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.88803482055664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004211448132991791, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 13.948914527893066, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08975423872470856, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.720821380615234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05473819747567177, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.732179641723633, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.011058162897825241, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 139.3032989501953, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0672445073723793} +{"step": 356515840, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 86.88341522216797, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.4946916699409485, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.19559669494629, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.013254192657768726, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.664097309112549, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.14799830317497253, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.634961128234863, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.19735008478164673, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.923097133636475, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.264306902885437, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.893271446228027, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.23252034187316895, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.976705551147461, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.008312305435538292, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.1381254196167, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.14877310395240784, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 9.988687515258789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11959146708250046, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.01817512512207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004985509440302849, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.221377849578857, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.04327036067843437, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.092094421386719, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04812876135110855, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.532867908477783, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12580955028533936, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.567161560058594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08359695225954056, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.894251823425293, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004494020249694586, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.022294044494629, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.10949862748384476, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.753862380981445, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06789027899503708, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.84803009033203, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.016078868880867958, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 142.3170623779297, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.09170721471309662} +{"step": 377487360, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 87.55762481689453, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09879064559936523, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.199996948242188, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004930524155497551, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.706821441650391, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023350654169917107, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.675246238708496, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.029422717168927193, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.92694616317749, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.139284148812294, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.898825645446777, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13492606580257416, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.977460861206055, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0036365468986332417, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.190213203430176, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09841974824666977, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.01545238494873, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09144449234008789, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.024049758911133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0032758808229118586, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.294898986816406, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014967006631195545, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.150996685028076, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.017605813220143318, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.537290573120117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.086213119328022, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.572899341583252, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06196281686425209, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.902931213378906, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028263332787901163, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.097919464111328, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06701118499040604, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.788416862487793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04526611045002937, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 17.96578598022461, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.011212065815925598, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 145.20225524902344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0480881966650486} +{"step": 398458880, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 88.20531463623047, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1514902114868164, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.20570182800293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.009575305506587029, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.755162239074707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.030629906803369522, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.718952178955078, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.035335518419742584, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.929346561431885, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.26272687315940857, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.902133464813232, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.2554568350315094, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.977473258972168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0055756643414497375, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.238914489746094, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.17926670610904694, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.039090156555176, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.19742098450660706, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.03015899658203, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.007348158862441778, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.37347412109375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.016834121197462082, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.213879585266113, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.022423656657338142, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.54021692276001, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.23651675879955292, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.576522350311279, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.15455780923366547, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.913236618041992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.013982337899506092, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.175613403320312, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.3077488839626312, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.827777862548828, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.16911807656288147, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.085403442382812, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.023000072687864304, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 147.96485900878906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.11749102175235748} +{"step": 419430400, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 88.82459259033203, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11712650209665298, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.212909698486328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004997153766453266, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.801831245422363, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.028621839359402657, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.760676860809326, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03238552063703537, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.936990261077881, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13076050579547882, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.910473346710205, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1285037249326706, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.978689193725586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0036420514807105064, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.286320686340332, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09396156668663025, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.062686920166016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0845579281449318, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.03641128540039, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003269617445766926, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.444878101348877, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015221277251839638, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.271318435668945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.017060020938515663, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.545815467834473, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09726972132921219, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.5828857421875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07181592285633087, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.918316841125488, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0038515636697411537, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.240065574645996, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09175842255353928, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.857378959655762, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0621543787419796, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.198360443115234, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.015403669327497482, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 150.53001403808594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05482197180390358} +{"step": 440401920, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 89.4228744506836, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08391167968511581, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.216922760009766, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004185161553323269, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.839463710784912, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.020028837025165558, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.793783664703369, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.025188080966472626, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.938128471374512, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.12312031537294388, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.913086891174316, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12127629667520523, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.976882934570312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0037242479156702757, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.323480606079102, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09010065346956253, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.079744338989258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0855097770690918, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.04416275024414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003547191619873047, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.5219316482543945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012416060082614422, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.33268404006958, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.015814075246453285, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.550883769989014, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09659966826438904, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.589540004730225, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07359447330236435, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.928277015686035, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0038679938297718763, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.31164836883545, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09224634617567062, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.893924713134766, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07222262024879456, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.31654167175293, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008712154813110828, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 152.980712890625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04688451066613197} +{"step": 461373440, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 89.99681854248047, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.18393099308013916, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.22088623046875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.007415620610117912, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.881353855133057, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0421278141438961, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.829441547393799, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05573078989982605, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.937292575836182, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.20642876625061035, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.913735866546631, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.19878563284873962, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.97535514831543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004740701988339424, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.358060836791992, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1447451263666153, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.095826148986816, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.12467294931411743, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.053081512451172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004543714225292206, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.60038423538208, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02535291761159897, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.394322395324707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03225056082010269, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.556602478027344, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13851726055145264, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.59694766998291, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09738180041313171, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.93692398071289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.005161041859537363, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.378222465515137, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.11741092801094055, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.92647647857666, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07550046592950821, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.431392669677734, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.013762586750090122, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 155.31149291992188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0767865777015686} +{"step": 482344960, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 90.548828125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.13044913113117218, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.223146438598633, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005232447292655706, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.916891098022461, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0326094813644886, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.859445095062256, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.041687414050102234, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.935466766357422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1592888981103897, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.912483215332031, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15644021332263947, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.972992897033691, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004404603037983179, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.387478828430176, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10470688343048096, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.109164237976074, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09462756663560867, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.063392639160156, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0033019250258803368, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.677731037139893, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02049657516181469, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.453804969787598, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.024937888607382774, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.562489032745361, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11816700547933578, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.605605602264404, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08989624679088593, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.946958541870117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.005483690183609724, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.443861961364746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1408630907535553, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.957887649536133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.09505216777324677, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.545467376708984, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006806747522205114, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 157.5385284423828, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06928844749927521} +{"step": 503316480, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 91.0787124633789, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.15482942759990692, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.225324630737305, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006825645454227924, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.951680660247803, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.032427895814180374, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.888942718505859, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.04058993235230446, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.933358669281006, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.20053145289421082, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.9107255935668945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.19609995186328888, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.970565795898438, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004544663708657026, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.414046287536621, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.14231522381305695, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.121010780334473, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13108165562152863, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.073949813842773, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005287831649184227, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.757119178771973, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.03142864629626274, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.513345241546631, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04365815594792366, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.568297863006592, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1620839387178421, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.613938331604004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.1100650280714035, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.956657409667969, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.005169705953449011, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.504817008972168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.14070560038089752, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 9.988649368286133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.09117525815963745, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.658218383789062, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.016602616757154465, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 159.65496826171875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0700649619102478} +{"step": 524288000, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 91.58705139160156, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.17225605249404907, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.22798728942871, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0057569993659853935, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 6.985652446746826, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.05004100874066353, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.917131423950195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.06459833681583405, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.931731224060059, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15766432881355286, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.9094624519348145, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16137011349201202, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.96774673461914, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004264338407665491, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.43817138671875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11171239614486694, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.131086349487305, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09060262143611908, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.08565902709961, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003619621740654111, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.835906028747559, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.03307221457362175, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.573214054107666, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.039275072515010834, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.574804782867432, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10709687322378159, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.622570037841797, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07956699281930923, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.965563774108887, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003730735508725047, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.560507774353027, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08750485628843307, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.016007423400879, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.060111045837402344, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.767864227294922, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008136226795613766, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 161.6614227294922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05586345121264458} +{"step": 545259520, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 92.07450866699219, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.15075813233852386, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.229339599609375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00691896490752697, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.0199875831604, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03018379956483841, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.945146560668945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03508550673723221, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.928421974182129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.20034290850162506, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.9064764976501465, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.19745580852031708, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.965185165405273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005777812097221613, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.46104621887207, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1476607769727707, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.140911102294922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13427744805812836, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.09836769104004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.006634987890720367, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.914745330810547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.027944300323724747, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.632318019866943, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.041288457810878754, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.582433700561523, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.20570607483386993, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.632349491119385, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.10697551816701889, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.973767280578613, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.005669951904565096, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.612305641174316, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1221502274274826, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.042247772216797, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.11274224519729614, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.874841690063477, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.014911586418747902, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 163.55862426757812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08100908994674683} +{"step": 566231040, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 92.54312896728516, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09311813116073608, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.23174476623535, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004505771212279797, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.052672863006592, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02304401807487011, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.972209453582764, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027856023982167244, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.926371097564697, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13804800808429718, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.904661178588867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14332617819309235, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.962691307067871, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0031494051218032837, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.48305892944336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10206375271081924, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.149641036987305, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08141127973794937, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.110715866088867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00278551341034472, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 6.99580192565918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012446424923837185, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.692395210266113, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.017131507396697998, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.5887298583984375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09492745995521545, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.639759063720703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0735788568854332, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.980902671813965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0037464050110429525, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.661474227905273, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0859096422791481, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.065576553344727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06543131917715073, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 18.97650909423828, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007831182330846786, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 165.36285400390625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04706459492444992} +{"step": 587202560, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 92.99466705322266, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.12662021815776825, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.23180389404297, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005261004436761141, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.081130027770996, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.031018510460853577, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 6.996013641357422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.04222097992897034, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.921595573425293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1598033756017685, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.900201320648193, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1582438349723816, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.958269119262695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004107198677957058, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.500168800354004, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12032666802406311, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.155326843261719, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09838780760765076, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.12537956237793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00387371564283967, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.083957672119141, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01677710749208927, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.757487773895264, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02495959959924221, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.595648288726807, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12845444679260254, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.648943901062012, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08615271002054214, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.9894437789917, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004078408237546682, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.709073066711426, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09053977578878403, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.090377807617188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06650441139936447, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.08208465576172, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.011222833767533302, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 167.09292602539062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05993615463376045} +{"step": 608174080, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 93.42768859863281, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.19094841182231903, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.232418060302734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.008451608009636402, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.111935138702393, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.04345696419477463, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.021318435668945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.05591581016778946, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.917372703552246, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.25816720724105835, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.896277904510498, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.2314184010028839, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.953652381896973, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005206046625971794, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.51641845703125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.15580213069915771, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.160825729370117, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13534703850746155, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.139427185058594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005027914419770241, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.167648792266846, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02901388332247734, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.817709922790527, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0426316112279892, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.6021270751953125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1754629909992218, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.657379150390625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.12667489051818848, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 15.99716567993164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.009387231431901455, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.753622055053711, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.20426307618618011, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.114582061767578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.12163450568914413, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.188594818115234, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.01679312437772751, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 168.7510528564453, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0936063751578331} +{"step": 629145600, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 93.84310913085938, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10595779865980148, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.23512077331543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004970278590917587, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.143996715545654, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.022588005289435387, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.04677677154541, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.029694601893424988, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.916067123413086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15821990370750427, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.894464492797852, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15721826255321503, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.951798439025879, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.00360694108530879, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.53537654876709, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11739635467529297, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.168502807617188, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09823963791131973, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.15316390991211, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003530417336151004, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.252838611602783, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013136184774339199, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.877594470977783, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.021755294874310493, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.608679294586182, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12392225116491318, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.664714336395264, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08810354769229889, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.00156021118164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0035967256408184767, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.791335105895996, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08995746821165085, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.131881713867188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0639408677816391, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.28299331665039, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007006132509559393, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 170.302001953125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05371110141277313} +{"step": 650117120, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 94.2413558959961, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10382077097892761, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.23381805419922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004374297801405191, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.165363788604736, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.025081081315875053, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.065510272979736, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.032148998230695724, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.911626815795898, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14122778177261353, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.889969348907471, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14172284305095673, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.946556091308594, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003207521280273795, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.546817779541016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10718733817338943, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.171223640441895, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08256553113460541, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.167591094970703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0035583048593252897, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.339625835418701, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013714350759983063, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 6.938138961791992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023259621113538742, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.614846706390381, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10635057836771011, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.67283821105957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07349138706922531, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.008880615234375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0031888207886368036, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.831485748291016, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07875405997037888, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.152565002441406, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05492309853434563, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.38007164001465, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009448260068893433, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 171.78060913085938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04928857833147049} +{"step": 671088640, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 94.62193298339844, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.12983185052871704, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.232818603515625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006325278431177139, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.192888259887695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.021826716139912605, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.087399005889893, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027498189359903336, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.905391693115234, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.2242937833070755, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.883708953857422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.2042037546634674, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.942424774169922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0045170411467552185, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.559067726135254, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.14943474531173706, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.175202369689941, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.12161900848150253, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.18353843688965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004915027413517237, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.429213523864746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.021640177816152573, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.001055717468262, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.034694913774728775, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.621705055236816, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1654706746339798, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.681648254394531, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.10519864410161972, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.015548706054688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003718830179423094, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.868739128112793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.10001227259635925, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.171826362609863, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06961340457201004, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.479305267333984, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.011101193726062775, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 173.2014923095703, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06101597473025322} +{"step": 692060160, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 94.98848724365234, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1275801807641983, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.23150634765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00539840292185545, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.216818809509277, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.03970112279057503, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.107250213623047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.050824400037527084, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.898680686950684, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16321253776550293, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.876818656921387, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16125506162643433, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.938138008117676, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0036688209511339664, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.57021713256836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11898025125265121, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.178980827331543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09235744923353195, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.19951820373535, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003694143146276474, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.514278411865234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015459626913070679, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.059179782867432, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02395905926823616, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.628508567810059, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11917136609554291, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.6902289390563965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09001897275447845, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.02215576171875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0040156543254852295, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.904047966003418, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.10562583059072495, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.189802169799805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0754232406616211, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.576566696166992, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00649035582318902, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 174.55824279785156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0422859787940979} +{"step": 713031680, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 95.33937072753906, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11759142577648163, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.230520248413086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005111567676067352, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.240697383880615, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02219490148127079, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.126221179962158, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027795638889074326, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.893097400665283, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16904066503047943, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.8708696365356445, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16757701337337494, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.933444023132324, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003600281896069646, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.579181671142578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12517838180065155, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.181953430175781, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0989326685667038, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.214818954467773, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004031594842672348, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.592922687530518, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014079410582780838, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.11325216293335, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.022097961977124214, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.63470458984375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12983879446983337, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.69867467880249, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09466271102428436, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.0277156829834, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004779823124408722, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.93511962890625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.11762966960668564, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.205894470214844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.08747638761997223, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.670211791992188, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0053726676851511, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 175.85421752929688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05434586852788925} +{"step": 734003200, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 95.67573547363281, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09394030272960663, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.22826385498047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004197848495095968, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.262969970703125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01859707571566105, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.144403457641602, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02430582046508789, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.8859028816223145, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14321987330913544, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.863649845123291, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13572491705417633, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.929401397705078, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002638125093653798, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.588912963867188, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09458004683256149, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.184822082519531, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0769941434264183, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.231956481933594, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0029524685814976692, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.676023006439209, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010039342567324638, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.1703667640686035, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01655508205294609, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.640932559967041, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09817222505807877, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.706981658935547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07407516986131668, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.032865524291992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0035392853897064924, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.963860511779785, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08533716201782227, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.22107982635498, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06554237753152847, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.76414680480957, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009063346311450005, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 177.09584045410156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04822414368391037} +{"step": 754974720, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 95.998291015625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.13088364899158478, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.227014541625977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005570572800934315, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.285639762878418, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023961329832673073, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.162644863128662, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.029919708147644997, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.8799920082092285, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.19090348482131958, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.857325077056885, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.17464172840118408, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.925089836120605, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003984976559877396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.598137855529785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12315371632575989, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.187389373779297, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09965420514345169, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.24838638305664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003382632276043296, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.757114887237549, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014483612962067127, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.225400924682617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.022902727127075195, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.647815227508545, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1224302351474762, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.7153825759887695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09235640615224838, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.036941528320312, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003440422937273979, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 14.990671157836914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08920009434223175, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.234676361083984, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06489621847867966, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.852766036987305, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00906653143465519, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 178.27532958984375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.048468396067619324} +{"step": 775946240, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 96.306640625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1019570529460907, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.22580337524414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004563652910292149, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.308208465576172, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.020241189748048782, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.1812663078308105, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02849896252155304, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.874294281005859, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15498197078704834, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.85123348236084, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13935565948486328, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.920550346374512, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0031827993225306273, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.606101036071777, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10716540366411209, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.189397811889648, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08026865124702454, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.26432991027832, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003183528082445264, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.836119651794434, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013145992532372475, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.279488563537598, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02099030092358589, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.653266429901123, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10179790109395981, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.723034858703613, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07350282371044159, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.041902542114258, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003054909873753786, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.018330574035645, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07494408637285233, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.249434471130371, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.055476732552051544, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 19.940675735473633, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006819596514105797, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 179.40139770507812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03878428786993027} +{"step": 796917760, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 96.60041809082031, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10631448775529861, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.22406005859375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005395122338086367, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.330290794372559, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.020021842792630196, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.199182510375977, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027250193059444427, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.867520809173584, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.17859485745429993, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.843725204467773, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16585373878479004, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.915215492248535, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0037507738452404737, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.611644744873047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11664588004350662, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.190326690673828, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08809297531843185, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.282115936279297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0036110375076532364, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.917365550994873, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010339113883674145, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.335137367248535, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01653970405459404, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.659985065460205, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12007098644971848, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.731705665588379, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08134934306144714, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.046648025512695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003633325919508934, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.043209075927734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09322387725114822, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.263504981994629, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07070150971412659, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.02789306640625, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009489250369369984, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 180.48353576660156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04943721368908882} +{"step": 817889280, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 96.87996673583984, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.13767629861831665, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.220216751098633, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006220632698386908, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.347241401672363, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.029035167768597603, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.213191509246826, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03837667778134346, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.859448432922363, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.19704455137252808, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.834871292114258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.20417886972427368, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.911145210266113, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004477074835449457, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.619013786315918, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.15552102029323578, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.192142486572266, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1233123168349266, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.29975700378418, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.005072740837931633, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 7.996663570404053, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.024740593507885933, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.38875150680542, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.044429440051317215, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.666155815124512, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.18392515182495117, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.740313529968262, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.11512491852045059, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.051010131835938, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0077929189428687096, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.067505836486816, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.18878188729286194, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.276771545410156, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.12294253706932068, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.11403465270996, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007344083394855261, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 181.52134704589844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.07733167707920074} +{"step": 838860800, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 97.1453857421875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10234362632036209, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.216928482055664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0046647475101053715, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.362961292266846, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.021255213767290115, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.226145267486572, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.0284762904047966, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.851893901824951, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1603088080883026, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.826932430267334, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16819250583648682, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.905652046203613, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004169578664004803, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.623359680175781, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.134686678647995, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.192557334899902, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10064878314733505, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.319673538208008, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00410562613978982, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.080368995666504, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014838915318250656, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.444242477416992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.025548772886395454, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.673666000366211, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13260751962661743, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.750917434692383, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09520163387060165, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.0540828704834, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.00317081599496305, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.087262153625488, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09304875135421753, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.287644386291504, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0665452629327774, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.19620704650879, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009802504442632198, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 182.50436401367188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04795417562127113} +{"step": 859832320, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 97.39765930175781, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1212027296423912, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.21510124206543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0050684306770563126, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.3850579261779785, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023021679371595383, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.244828224182129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02948431484401226, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.8447184562683105, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.18169371783733368, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.8193135261535645, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.18736666440963745, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.900679588317871, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.00407248642295599, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.62826919555664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1400635987520218, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.193181991577148, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10813773423433304, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.337749481201172, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004635254852473736, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.15839958190918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01907477341592312, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.496582984924316, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03347678855061531, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.680220127105713, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.15184316039085388, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.759598255157471, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09839041531085968, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.057111740112305, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004323484376072884, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.10583209991455, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.11095236986875534, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.298421859741211, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.08085902035236359, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.276960372924805, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.010553679428994656, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 183.44471740722656, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05524995177984238} +{"step": 880803840, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 97.63582611083984, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1302792876958847, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.211782455444336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004983327817171812, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.400809288024902, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.030652543529868126, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.257268905639648, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.04345262050628662, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.837380409240723, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16729636490345, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.811247825622559, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1773761361837387, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.896677017211914, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0038491245359182358, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.633590698242188, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13402381539344788, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.193913459777832, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10411501675844193, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.355709075927734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004817499313503504, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.234691619873047, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015435583889484406, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.547577381134033, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.027564184740185738, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.686028480529785, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13794860243797302, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.767368316650391, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08869976550340652, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.060588836669922, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003447948256507516, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.125040054321289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09228742867708206, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.309091567993164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06809671223163605, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.35704803466797, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008599378168582916, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 184.34555053710938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04625782370567322} +{"step": 901775360, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 97.86257934570312, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1268908530473709, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.208724975585938, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005689583718776703, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.416347980499268, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02403145469725132, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.269993782043457, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.032574668526649475, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.830732822418213, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.21000751852989197, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.8040056228637695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.19244424998760223, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.891785621643066, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003814539173617959, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.636624336242676, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.14679335057735443, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.193835258483887, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10765157639980316, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.373044967651367, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004450382199138403, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.308414459228516, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014683425426483154, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.596048831939697, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02597920410335064, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.691842079162598, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1485251635313034, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.774977684020996, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.10095543414354324, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.063323974609375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.005270408932119608, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.140955924987793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1287379413843155, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.317961692810059, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.08620213717222214, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.434444427490234, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.010419826954603195, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 185.19871520996094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.054722703993320465} +{"step": 922746880, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 98.07791137695312, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1155291274189949, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.20656394958496, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00497082294896245, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.4333086013793945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.022036660462617874, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.284237384796143, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.029442913830280304, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.824109077453613, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.17837117612361908, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.796790599822998, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.177406907081604, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.887226104736328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.00335360923781991, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.639330863952637, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13146744668483734, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.19327449798584, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10547453165054321, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.3902530670166, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004569668788462877, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.38241195678711, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012643421068787575, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.6449761390686035, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02389386110007763, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.696677207946777, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1434052437543869, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.781522750854492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09994996339082718, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.06623649597168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.006531578954309225, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.158130645751953, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1458556205034256, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.327372550964355, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.10065927356481552, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.50840187072754, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00654254388064146, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 186.01577758789062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.06040726602077484} +{"step": 943718400, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 98.28111267089844, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.12471514940261841, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.20271110534668, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0052994610741734505, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.447169780731201, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02525326982140541, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.2958550453186035, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03427357226610184, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.816053867340088, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.18370148539543152, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.788259029388428, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1796383112668991, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.882843971252441, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004238704219460487, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.641569137573242, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13498854637145996, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.192780494689941, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10036492347717285, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.407766342163086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003960480913519859, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.453996658325195, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015059561468660831, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.692646026611328, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.028543902561068535, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.701420307159424, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13034959137439728, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.788503170013428, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08846735209226608, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.069175720214844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0038180712144821882, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.173663139343262, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09623953700065613, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.336935043334961, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07107062637805939, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.583221435546875, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005771055817604065, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 186.7968292236328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.051455456763505936} +{"step": 964689920, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 98.47245788574219, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.13261501491069794, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.19887351989746, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005914230830967426, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.459512233734131, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02466682530939579, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.305739879608154, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03527207672595978, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.807804107666016, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1995498538017273, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.7793731689453125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.2098565548658371, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.879356384277344, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004989039618521929, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.644309043884277, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1678691804409027, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.192694664001465, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.12005241215229034, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.424959182739258, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004721946083009243, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.522622108459473, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.016670292243361473, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.737500190734863, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.032933373004198074, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.706663131713867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1603933721780777, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.795518398284912, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.10836854577064514, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.071674346923828, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004487736150622368, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.187721252441406, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.11726914346218109, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.345362663269043, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.08179665356874466, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.654821395874023, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00885434914380312, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 187.5422821044922, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05596044659614563} +{"step": 985661440, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 98.65229797363281, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.1004035621881485, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.195804595947266, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004363427869975567, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.473291397094727, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01832820661365986, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.316906929016113, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.023538218811154366, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.800487995147705, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15794844925403595, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.77152156829834, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16434521973133087, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.874800682067871, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003844300052151084, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.644891738891602, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12103970348834991, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.191058158874512, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09276780486106873, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.44251251220703, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003936935681849718, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.590960502624512, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012559405528008938, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.781299114227295, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.021905720233917236, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.712394714355469, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11809826642274857, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.803104400634766, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08554702252149582, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.073532104492188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0037536693271249533, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.199711799621582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09647910296916962, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.352608680725098, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06758711487054825, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.725391387939453, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.004477429203689098, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 188.2526397705078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04246075823903084} +{"step": 1006632960, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 98.82089233398438, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.15528631210327148, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.19056510925293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.006022888235747814, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.48045539855957, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02907666191458702, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.322803497314453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03921954333782196, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.791895389556885, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.22771666944026947, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.762531280517578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.24415770173072815, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.870258331298828, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.006169500760734081, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.645258903503418, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.18308919668197632, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.189537048339844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.14503131806850433, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.461198806762695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00639282027259469, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.659329414367676, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.0339343287050724, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.8256516456604, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.06308723986148834, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.718966007232666, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.2143942266702652, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.81181526184082, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.1396917700767517, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.076108932495117, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.010179542005062103, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.213021278381348, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.23303239047527313, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.361063003540039, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.15677113831043243, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.795162200927734, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007820487953722477, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 188.9204559326172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.08688591420650482} +{"step": 1027604480, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 98.9798812866211, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09010152518749237, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.186643600463867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003212433774024248, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.493265628814697, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.020784998312592506, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.332517623901367, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027287926524877548, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.783655643463135, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.12321101874113083, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.7540435791015625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13429614901542664, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.866738319396973, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002835871884599328, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.64593505859375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10275924205780029, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.188344955444336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07957761734724045, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.478893280029297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0030809124000370502, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.725595474243164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012478345073759556, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.868503093719482, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.020445138216018677, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.724573135375977, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09659238159656525, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.819666385650635, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06639868021011353, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.0768985748291, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002157519105821848, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.22175407409668, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06640730053186417, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.366741180419922, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05058366432785988, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.85984992980957, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006298630498349667, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 189.5593719482422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03538743779063225} +{"step": 1048576000, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.12769317626953, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10651274025440216, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.182802200317383, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004144119098782539, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.506289005279541, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.019939787685871124, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.343016147613525, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027321722358465195, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.775634765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15168210864067078, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.74553918838501, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16195929050445557, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.862722396850586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0037371101789176464, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.645999908447266, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12619557976722717, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.186921119689941, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0944753885269165, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.49630355834961, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0035916161723434925, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.79088020324707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01489830669015646, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.911147594451904, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.030575867742300034, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.7296061515808105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11752719432115555, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.826395034790039, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08088203519582748, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.07961654663086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003169654170051217, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.23426342010498, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08458977192640305, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.374101638793945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06459670513868332, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.924589157104492, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0065938448533415794, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 190.175048828125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04750976711511612} +{"step": 1069547520, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.26629638671875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.14057929813861847, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.17868423461914, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0056366752833127975, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.518093585968018, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.031006138771772385, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.352963447570801, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.0440651997923851, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.767252445220947, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.20459504425525665, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.736786842346191, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.20889262855052948, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.859658241271973, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004766595549881458, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.64697265625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1600532978773117, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.186137199401855, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11538305878639221, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.51382064819336, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0050522033125162125, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.856047630310059, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.017282402142882347, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.953855514526367, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0327884666621685, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.734170913696289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.147215336561203, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.832700252532959, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.10176074504852295, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.081466674804688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004545057658106089, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.243746757507324, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.11357424408197403, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.38032341003418, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0786777064204216, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 20.989078521728516, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007650890853255987, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 190.76580810546875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.051645126193761826} +{"step": 1090519040, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.39591217041016, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11911141127347946, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.173725128173828, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00496203126385808, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.525610446929932, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.025073682889342308, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.358611583709717, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03250865638256073, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.758661270141602, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.17146070301532745, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.727630615234375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16982458531856537, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.856001853942871, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003530273912474513, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.64710807800293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12964509427547455, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.184660911560059, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10061006247997284, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.53177261352539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004003362730145454, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.919732093811035, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015549445524811745, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 7.995266437530518, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.027941172942519188, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.739104270935059, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12097016721963882, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.839469909667969, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08148237317800522, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.083419799804688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002813854021951556, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.252964973449707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07614483684301376, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.386521339416504, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05588327720761299, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.05142593383789, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005237509496510029, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 191.32125854492188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0383199080824852} +{"step": 1111490560, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.51710510253906, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10261762142181396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.169281005859375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004577391780912876, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.534152984619141, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.019993096590042114, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.365728855133057, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02689042128622532, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.7506279945373535, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1656954139471054, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.719189643859863, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16864211857318878, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.852669715881348, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003818846307694912, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.647116661071777, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13011693954467773, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.183304786682129, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0983070507645607, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.54902458190918, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003463003085926175, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 8.98132610321045, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015732763335108757, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.03524112701416, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.027592742815613747, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.7440996170043945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11414869874715805, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.845987319946289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08431782573461533, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.08522605895996, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0036569680087268353, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.261557579040527, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08886972069740295, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.392251968383789, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.066740021109581, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.111995697021484, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0077429767698049545, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 191.85159301757812, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.043542515486478806} +{"step": 1132462080, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.62793731689453, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.13069023191928864, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.165536880493164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005904384423047304, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.543851852416992, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.026175400242209435, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.372917175292969, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03537069261074066, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.743511199951172, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.2109449803829193, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.71150541305542, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.22618553042411804, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.849400520324707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.005032227374613285, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.646993637084961, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1810511201620102, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.182018280029297, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.13098202645778656, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.565664291381836, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004869219847023487, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.040513038635254, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.017200561240315437, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.07378101348877, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03385179862380028, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.748708724975586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.17519287765026093, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.851814270019531, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.1096656545996666, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.086177825927734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0061867074109613895, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.268401145935059, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.14051347970962524, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.397008895874023, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.09724359214305878, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.16884422302246, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005828025285154581, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 192.3588409423828, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.049874208867549896} +{"step": 1153433600, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.73089599609375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10678752511739731, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.1622371673584, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0039215534925460815, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.553075790405273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02281087078154087, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.380390167236328, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02807822823524475, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.736808776855469, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1585036665201187, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.704453945159912, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1731693148612976, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.846071243286133, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004054786171764135, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.645956039428711, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1326330453157425, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.179986953735352, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10320248454809189, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.58074951171875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0037697702646255493, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.095364570617676, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.020630858838558197, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.109469413757324, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.039853211492300034, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.7528300285339355, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12194225937128067, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.857038497924805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08380627632141113, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.08710289001465, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0035412313882261515, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.27432918548584, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09323986619710922, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.400458335876465, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06791810691356659, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.221506118774414, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.008614501915872097, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 192.83981323242188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05064322426915169} +{"step": 1174405120, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.82572937011719, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11042385548353195, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.1575927734375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004367611836642027, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.559239387512207, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0196452084928751, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.385103225708008, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.025604691356420517, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.728788375854492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15740282833576202, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.696139812469482, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16954313218593597, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.842945098876953, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003942230250686407, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.644637107849121, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13185076415538788, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.17810344696045, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10032328963279724, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.595897674560547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0040670959278941154, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.147873878479004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012658125720918179, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.143744468688965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.024860935285687447, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.7567667961120605, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12847059965133667, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.862599849700928, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0836367979645729, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.088754653930664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0032165844459086657, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.281238555908203, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08514904975891113, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.405092239379883, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.060688819736242294, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.275623321533203, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007780879735946655, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 193.300048828125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0456763431429863} +{"step": 1195376640, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.91314697265625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10953246802091599, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.15363883972168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004151355009526014, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.569363117218018, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.024145133793354034, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.3932414054870605, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.034916266798973083, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.720333099365234, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16212034225463867, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.687307357788086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.18030470609664917, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.839784622192383, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004338525235652924, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.643430709838867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1334514170885086, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.176416397094727, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10680387914180756, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.612409591674805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004182939417660236, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.204498291015625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01446219440549612, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.180428504943848, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.028364308178424835, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.76096248626709, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13335634768009186, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.868084907531738, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09441198408603668, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.09046745300293, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.006756588816642761, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.287820816040039, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1399230808019638, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.409870147705078, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.10243535786867142, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.329256057739258, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.010027124546468258, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 193.73516845703125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.047234147787094116} +{"step": 1216348160, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 99.9939956665039, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11042178422212601, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.1481990814209, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.005023838020861149, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.574084758758545, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02227718196809292, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.396957874298096, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02970276027917862, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.711904048919678, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1767989546060562, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.678636074066162, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.17843908071517944, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.837717056274414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0037223028484731913, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.643352508544922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13896018266677856, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.175247192382812, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10621888190507889, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.62856101989746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004566980060189962, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.258594512939453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013785107061266899, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.215352058410645, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02609253115952015, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.76540470123291, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13490396738052368, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.873927116394043, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.09564508497714996, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.0914249420166, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.006907867733389139, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.293163299560547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.1471281796693802, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.41357421875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.11237628757953644, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.379436492919922, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.009362151846289635, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 194.1486053466797, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.05150788277387619} +{"step": 1237319680, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.0675048828125, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08126620203256607, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.14324188232422, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003433443605899811, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.580269813537598, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.015404362231492996, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.4021315574646, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.020463185384869576, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.703219413757324, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1265307366847992, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.6697540283203125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13534244894981384, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.8355131149292, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002595198340713978, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.642898559570312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1068185344338417, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.174034118652344, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08055013418197632, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.64443016052246, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003090417245402932, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.311223030090332, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010202044621109962, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.249587059020996, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.017922818660736084, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.769182205200195, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08989892899990082, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.879199504852295, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06585842370986938, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.092729568481445, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0026755386497825384, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.298818588256836, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06866078078746796, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.417393684387207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0559677854180336, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.428245544433594, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006531708873808384, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 194.54116821289062, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03591557964682579} +{"step": 1258291200, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.13485717773438, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08954710513353348, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.13819122314453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003745914436876774, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.586771011352539, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01742551103234291, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.407729625701904, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.021250618621706963, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.694828987121582, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14479689300060272, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.661207675933838, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15436650812625885, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.832619667053223, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0032646814361214638, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.641375541687012, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12537126243114471, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.17227840423584, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09559472650289536, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.66010093688965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003267421619966626, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.361787796020508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015643587335944176, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.282840728759766, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02938665635883808, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.773437976837158, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.12179148197174072, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.884770393371582, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07559563219547272, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.09459686279297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0032304697670042515, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.30481243133545, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07933464646339417, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.421708106994629, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05954549089074135, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.476755142211914, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006471520289778709, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 194.9129638671875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03802981972694397} +{"step": 1279262720, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.19542694091797, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09871192276477814, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.133054733276367, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003563911886885762, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.590018272399902, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.020566413179039955, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.410157680511475, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02640651911497116, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.687102317810059, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1471705585718155, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.653360366821289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15542767941951752, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.830120086669922, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003546996507793665, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.63992691040039, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.123995341360569, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.170363426208496, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09316641092300415, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.675003051757812, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003689272329211235, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.40910530090332, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.011871724389493465, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.313949584960938, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02250170148909092, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.777821063995361, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11209675669670105, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.890364170074463, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07701361924409866, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.09558868408203, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0027470062486827374, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.309333801269531, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0809108167886734, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.425063133239746, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.054057348519563675, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.523096084594727, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00518078776076436, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 195.2634735107422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03379831090569496} +{"step": 1300234240, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.2499771118164, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11221474409103394, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.127809524536133, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004768854007124901, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.593750476837158, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.021581454202532768, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.412952899932861, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027377942577004433, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.679348468780518, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.18056850135326385, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.645517349243164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.18548670411109924, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.827853202819824, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004720362368971109, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.638736724853516, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.15515626966953278, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.168761253356934, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.11475449055433273, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.68997573852539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00405837269499898, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.455126762390137, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01320855226367712, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.343771934509277, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.024272017180919647, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.782304763793945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13223250210285187, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.895991802215576, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0842290073633194, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.096431732177734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0031166630797088146, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.313030242919922, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0886552706360817, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.42792797088623, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06411922723054886, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.567041397094727, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005809097085148096, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 195.59564208984375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03759434446692467} +{"step": 1321205760, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.29916381835938, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10278281569480896, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.12339210510254, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003787414403632283, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.598418235778809, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02260435000061989, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.416660308837891, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.028377557173371315, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.672183513641357, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14574773609638214, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.6381425857543945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.17010889947414398, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.825716972351074, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003653018968179822, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.637194633483887, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13885891437530518, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.166963577270508, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10282708704471588, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.704147338867188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0040541947819292545, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.500099182128906, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.016729120165109634, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.372617721557617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0352151021361351, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.786229133605957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.13271813094615936, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.900993824005127, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0788642093539238, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.097108840942383, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0032519011292606592, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.315937042236328, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.09157442301511765, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.430418014526367, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0669991597533226, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.608863830566406, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005451614037156105, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 195.9062042236328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04422377049922943} +{"step": 1342177280, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.34284210205078, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10960704833269119, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.1191463470459, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0041627176105976105, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.603427886962891, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.020786680281162262, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.420479774475098, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02681453339755535, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.665090084075928, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.15402941405773163, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.630996227264404, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16976523399353027, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.823486328125, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003742033150047064, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.63558578491211, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.13300873339176178, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.165262222290039, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.1008535698056221, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.718093872070312, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003913856111466885, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.542842864990234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.016579488292336464, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.40060806274414, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.031488120555877686, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.790141582489014, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11232790350914001, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.905934810638428, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07383766770362854, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.098115921020508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002611933508887887, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.319554328918457, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07528361678123474, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.433296203613281, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.056304093450307846, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.649261474609375, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0062871212139725685, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 196.19830322265625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03777637332677841} +{"step": 1363148800, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.38215637207031, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10324814170598984, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.11477279663086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004064860753715038, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.606633186340332, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.022044818848371506, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.423129081726074, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.030496088787913322, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.6579365730285645, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.16478000581264496, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.623669147491455, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.17789022624492645, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.821672439575195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.004302822519093752, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.634427070617676, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1436672955751419, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.16396427154541, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10533807426691055, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.731456756591797, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0036323072854429483, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.583086013793945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014814384281635284, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.427249908447266, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02958487533032894, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.793625831604004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1254313588142395, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.910625457763672, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.08658352494239807, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.098894119262695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0058467756025493145, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.322229385375977, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.11921367049217224, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.435396194458008, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.09386109560728073, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.688560485839844, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0050507523119449615, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 196.4755401611328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04307323321700096} +{"step": 1384120320, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.41688537597656, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09656279534101486, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.11070442199707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0036614208947867155, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.611705303192139, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.022919777780771255, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.428008556365967, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03207885101437569, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.651041507720947, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14672449231147766, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.6166887283325195, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1636475920677185, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.819934844970703, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.00342118670232594, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.633185386657715, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1283842921257019, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.162425994873047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09942417591810226, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.744441986083984, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0032699715811759233, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.621515274047852, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013561628758907318, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.452682495117188, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.024110734462738037, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.7972731590271, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10735034197568893, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.915194511413574, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07556638866662979, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.099796295166016, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028208065778017044, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.325056076049805, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07528257369995117, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.437824249267578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.056674059480428696, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.726093292236328, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0056298160925507545, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 196.73617553710938, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.037217073142528534} +{"step": 1405091840, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.44718933105469, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11006606370210648, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.106863021850586, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0036891591735184193, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.61610746383667, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.0257908646017313, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.4316086769104, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.027132820338010788, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.644479751586914, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.14065809547901154, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.610052108764648, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16275393962860107, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.818203926086426, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003499683691188693, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.631647109985352, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12823785841464996, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.160761833190918, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10345868021249771, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.756887435913086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003807327477261424, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.658236503601074, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.02285510115325451, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.477005004882812, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.04307437688112259, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.800665378570557, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11790686845779419, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.919553756713867, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07686188817024231, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.10051918029785, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0038811853155493736, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.327200889587402, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08836697787046432, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.439923286437988, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06837569922208786, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.761863708496094, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007920476607978344, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 196.978515625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.04078070446848869} +{"step": 1426063360, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.47371673583984, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09424302726984024, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.102859497070312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003610918764024973, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.619541168212891, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01977868378162384, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.434969902038574, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.026562368497252464, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.6380815505981445, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13663454353809357, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.603567600250244, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15135708451271057, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.816895484924316, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0033794252667576075, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.630738258361816, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11876177042722702, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.159281730651855, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.08958183974027634, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.769739151000977, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0032948534935712814, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.695890426635742, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010132365860044956, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.501805305480957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01895921118557453, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.8039350509643555, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10366138815879822, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.923823833465576, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07286156713962555, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.101102828979492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0043161059729754925, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.32918643951416, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0949680283665657, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.441584587097168, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.07881379872560501, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.79617691040039, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.004656536970287561, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 197.20993041992188, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03497990220785141} +{"step": 1447034880, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.49663543701172, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.11512558162212372, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.098628997802734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.004086926579475403, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.622125625610352, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.029272258281707764, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.437264442443848, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03845778852701187, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.631668567657471, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1502557396888733, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.596986770629883, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15774253010749817, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.815567016601562, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0031649903394281864, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.629703521728516, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1241379976272583, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.15784740447998, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09292802214622498, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.781526565551758, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0030237578321248293, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.729691505432129, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013096564449369907, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.524120330810547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023625878617167473, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.8071064949035645, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10084657371044159, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.927951335906982, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06842400133609772, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.102022171020508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028542103245854378, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.33162784576416, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07350765913724899, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.443727493286133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.056138571351766586, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.828886032104492, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005975104868412018, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 197.42555236816406, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03249708190560341} +{"step": 1468006400, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.51661682128906, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10464347898960114, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.094589233398438, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003468578215688467, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.624521732330322, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02272045984864235, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.439393520355225, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.030367059633135796, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.62568473815918, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13657274842262268, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.590854644775391, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.15473392605781555, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.814160346984863, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0034955348819494247, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.628233909606934, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12881897389888763, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.156262397766113, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09839325398206711, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.792911529541016, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003705976065248251, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.761959075927734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.017124956473708153, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.545334815979004, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.03441246598958969, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.8103156089782715, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11753373593091965, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.931852340698242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0732162669301033, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.102859497070312, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028020062018185854, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.333687782287598, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0766354575753212, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.445769309997559, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05453566089272499, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.859838485717773, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.004885174334049225, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 197.625732421875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03744090721011162} +{"step": 1488977920, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.5338134765625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08518777787685394, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.09157371520996, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0033081916626542807, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.628973960876465, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.018390273675322533, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.4430317878723145, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.022256236523389816, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.620120525360107, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.12381623685359955, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.585317611694336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1399444192647934, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.813178062438965, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002903360640630126, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.627225875854492, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11173885315656662, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.155080795288086, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0879414826631546, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.803495407104492, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0030338780488818884, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.792614936828613, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012108929455280304, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.565630912780762, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023586425930261612, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.812748432159424, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09567996859550476, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.934937477111816, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06434805691242218, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.10358428955078, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0021081482991576195, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.335415840148926, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.061479948461055756, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.447702407836914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04817603901028633, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.889286041259766, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0054357764311134815, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 197.8143310546875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03109530732035637} +{"step": 1509949440, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.54817199707031, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09021475166082382, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.087974548339844, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003477030200883746, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.631232738494873, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02023455873131752, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.444901943206787, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02404443919658661, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.614402770996094, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13754808902740479, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.579517364501953, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14726564288139343, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.812210083007812, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0031919998582452536, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.626157760620117, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12067253887653351, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.154047966003418, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09585736691951752, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.813291549682617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.004003292415291071, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.820880889892578, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012079577893018723, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.584528923034668, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.021605322137475014, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.8148579597473145, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.1055353656411171, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.937810897827148, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06641455739736557, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.10466957092285, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0030318759381771088, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.337642669677734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07632604241371155, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.449742317199707, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06112958863377571, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.91703224182129, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.007021838799118996, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 197.989501953125, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03293348476290703} +{"step": 1530920960, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.55994415283203, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10096747428178787, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.08427619934082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0033678163308650255, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.632856845855713, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02375645935535431, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.446295738220215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03367307037115097, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.608966827392578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1321885585784912, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.574111461639404, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1525215059518814, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.810826301574707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0036075985990464687, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.6244535446167, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12272728979587555, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.152800559997559, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09255269914865494, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.823551177978516, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0036229887045919895, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.849331855773926, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.015776799991726875, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.60354995727539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.028399622067809105, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.817602157592773, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.10295862704515457, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.941259860992432, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07049395143985748, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.105506896972656, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028336115647107363, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.339323043823242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.07355739176273346, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.451334953308105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05596111714839935, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.943273544311523, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0063399518840014935, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.15220642089844, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03598237782716751} +{"step": 1551892480, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.5697021484375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.10461777448654175, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.080890655517578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0034848267678171396, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.63512659072876, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.025102591142058372, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.448123931884766, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.03083587996661663, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.603837490081787, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13985797762870789, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.56892204284668, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.16150301694869995, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.809966087341309, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003637980669736862, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.62336540222168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1342935860157013, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.15176010131836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.10289040207862854, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.833154678344727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.003464378649368882, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.875879287719727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.017768556252121925, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.621347427368164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0348367765545845, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.8202667236328125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11764217913150787, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.9446563720703125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.07428672164678574, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.106042861938477, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.003003376070410013, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.340338706970215, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.08229371160268784, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.452560424804688, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.06102769449353218, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.967803955078125, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005212740041315556, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.30467224121094, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03820241987705231} +{"step": 1572864000, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.5775146484375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.09518817067146301, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.0781307220459, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0036048809997737408, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.638181209564209, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.022322332486510277, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.4507670402526855, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.028182996436953545, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.598881244659424, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.13334546983242035, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.564000606536865, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14738865196704865, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.809165000915527, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.003637021640315652, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.622492790222168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.12072375416755676, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.150833129882812, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.09144651889801025, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.841875076293945, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0033366631250828505, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.90030288696289, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.01226099207997322, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.6376953125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.021990539506077766, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.8222455978393555, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.11169242113828659, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.947127342224121, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0730045735836029, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.107131958007812, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.004203238524496555, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.34227466583252, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0963825136423111, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.4542875289917, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0763736143708229, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 21.990785598754883, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005265274550765753, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.44671630859375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03329169377684593} +{"step": 1593835520, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58390045166016, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08312865346670151, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.075027465820312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0031066760420799255, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.640066623687744, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01700853742659092, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.452343940734863, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02235587127506733, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.594245910644531, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.12412746250629425, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.559359073638916, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.14277300238609314, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.808320045471191, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0030739831272512674, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.621419906616211, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.11295574903488159, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.149942398071289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0870991051197052, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.850257873535156, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0033084002789109945, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.9229097366333, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010682780295610428, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.652841567993164, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01796749420464039, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.824498653411865, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.09512428194284439, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.949988842010498, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06383363157510757, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.1076602935791, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0024511832743883133, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.34316349029541, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06606201827526093, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.455531120300293, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.05264189839363098, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.012174606323242, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006247222889214754, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.57688903808594, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.033084601163864136} +{"step": 1614807040, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58882904052734, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0980665385723114, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.07250213623047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0033320847433060408, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.642020225524902, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.023087112233042717, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.453935623168945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.032495688647031784, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.590025424957275, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.12298639863729477, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.555104732513428, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13781341910362244, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.807616233825684, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002913964679464698, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.620368957519531, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1114140972495079, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.149042129516602, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0850197970867157, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.857994079589844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0033148080110549927, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.943581581115723, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.013220403343439102, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.666873931884766, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023902839049696922, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.826478004455566, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08832267671823502, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.952437877655029, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.06307312846183777, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.108299255371094, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0028864543419331312, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.3441743850708, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06995969265699387, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.456717491149902, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.054435912519693375, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.031892776489258, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006227685138583183, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.69552612304688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03289317339658737} +{"step": 1635778560, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59245300292969, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0905216708779335, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.0703067779541, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.003659480717033148, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.644102096557617, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.02296612039208412, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.455720901489258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02937205694615841, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.586029052734375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.11962096393108368, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.551176071166992, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12685805559158325, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.807103157043457, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002551042241975665, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.619512557983398, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09832820296287537, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.14818000793457, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07745198905467987, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.865251541137695, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0025678451638668776, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.962973594665527, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010527044534683228, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.680032730102539, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.018139051273465157, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.828149318695068, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07756782323122025, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.954627513885498, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.05572282150387764, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.109128952026367, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0016397872241213918, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.345386505126953, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05164561793208122, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.457962989807129, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04204130917787552, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.050222396850586, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005794766824692488, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.8043212890625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02925288677215576} +{"step": 1656750080, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59473419189453, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0816243588924408, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.067716598510742, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0028024015482515097, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.645333766937256, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01852058246731758, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.456725120544434, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02200108766555786, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.582137584686279, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1122281476855278, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.5474324226379395, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12576477229595184, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.806768417358398, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002547306939959526, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.618886947631836, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.1009729728102684, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.147481918334961, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07839463651180267, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.872146606445312, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002667316934093833, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.980761528015137, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.00958893820643425, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.692242622375488, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.017366042360663414, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.829809188842773, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07702487707138062, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.956698894500732, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.057846006006002426, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.10992431640625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0019642761908471584, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.34640121459961, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0558006688952446, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.45925521850586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04217938333749771, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.06722640991211, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006176256109029055, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.9021453857422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.03080914355814457} +{"step": 1677721600, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59580993652344, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08563796430826187, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.065048217773438, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0030856181401759386, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.645645618438721, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.019285378977656364, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.457001209259033, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.024114957079291344, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.578421592712402, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1055862307548523, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.543761730194092, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12332067638635635, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.80630111694336, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0024219730403274298, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.618111610412598, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10197530686855316, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.146697998046875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07730741798877716, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.878877639770508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002581618959084153, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 9.997550010681152, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012435373850166798, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.703895568847656, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.022277694195508957, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.831686019897461, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07733909785747528, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.9590044021606445, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.05738162621855736, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.110557556152344, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002043263055384159, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.347229957580566, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.057157378643751144, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.460358619689941, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.042066529393196106, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.082901000976562, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0054421573877334595, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 198.99171447753906, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.028474248945713043} +{"step": 1698693120, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59642791748047, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08811330795288086, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.063213348388672, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0030612393748015165, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.647176265716553, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.018863601610064507, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.458370208740234, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.023970728740096092, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.575138568878174, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.1160741001367569, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.540529727935791, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.13522179424762726, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.805742263793945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0028608033899217844, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.617204666137695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10835839062929153, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.145988464355469, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0864938348531723, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.884918212890625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0027340566739439964, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.012734413146973, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.014840261079370975, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.714370727539062, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02741350792348385, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.833225727081299, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.08643805980682373, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.9607462882995605, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.058352384716272354, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.1114501953125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002182702301070094, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.348403930664062, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.06497368961572647, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.46161937713623, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04947110265493393, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.097124099731445, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005275843199342489, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.07386779785156, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.030688196420669556} +{"step": 1719664640, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59651184082031, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.08273430913686752, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.061344146728516, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0028052094858139753, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.648284912109375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.019530462101101875, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.459368705749512, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.023452939465641975, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.572067737579346, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.10364118963479996, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.537477016448975, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.12514916062355042, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.805323600769043, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0025788128841668367, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.616447448730469, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.10154633969068527, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.145349502563477, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.07844222337007523, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.890281677246094, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00243179383687675, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.025952339172363, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012458492070436478, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.723388671875, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.02375340834259987, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.834658622741699, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.07546690106391907, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.962488174438477, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.055164191871881485, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.111948013305664, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.002811805810779333, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.348939895629883, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.0683613270521164, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.462361335754395, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.053199924528598785, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.109975814819336, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005107197444885969, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.14752197265625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.030795389786362648} +{"step": 1740636160, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59601593017578, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06802743673324585, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.059690475463867, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002596531994640827, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.649345397949219, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.016261449083685875, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.460267543792725, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.018258972093462944, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.569254398345947, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09484529495239258, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.534746170043945, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10946709662675858, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.805059432983398, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002190608298406005, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.615757942199707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.09005096554756165, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.14474105834961, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06723713129758835, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.895118713378906, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0022653371561318636, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.038155555725098, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009134424850344658, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.73169231414795, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.017010800540447235, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.835818767547607, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06651996821165085, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.963930606842041, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.049581728875637054, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.11277198791504, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0016879468457773328, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.349937438964844, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05056006833910942, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.46345043182373, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0433671697974205, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.121522903442383, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005727138835936785, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.21310424804688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.028132688254117966} +{"step": 1761607680, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59518432617188, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06742790341377258, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.057941436767578, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0022802934981882572, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.649971961975098, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.016534365713596344, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.460837364196777, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.01958926022052765, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.566618919372559, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08813410252332687, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.532205104827881, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10391626507043839, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.804863929748535, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002172160428017378, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.615205764770508, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08741600811481476, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.144279479980469, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06610962748527527, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.8997802734375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0022658086381852627, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.04980754852295, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009190279059112072, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.739691734313965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.015036946162581444, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.83713436126709, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06093938648700714, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.965534210205078, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.05032213404774666, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.11320686340332, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0019690110348165035, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.350259780883789, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.05122349038720131, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.46403980255127, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.04084176570177078, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.131864547729492, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005762050859630108, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.2718048095703, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02834651991724968} +{"step": 1782579200, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.5941162109375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.07118478417396545, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.056550979614258, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002544283401221037, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.650886535644531, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01910390704870224, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.461628437042236, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.021883007138967514, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.56433629989624, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09159558266401291, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.529959678649902, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10695591568946838, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.804656982421875, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0022200511302798986, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.61470890045166, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08629187196493149, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.143805503845215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06675545871257782, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.903884887695312, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002486462239176035, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.059635162353516, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009622693061828613, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.746520042419434, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01693277433514595, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.838259696960449, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06510142982006073, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.966840744018555, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04977698251605034, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.113643646240234, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0017059014644473791, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.350601196289062, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.049191657453775406, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.464545249938965, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.037911709398031235, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.14101791381836, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0053207348100841045, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.32334899902344, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.028442520648241043} +{"step": 1803550720, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59281921386719, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.07322364300489426, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.05528450012207, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0027742807287722826, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.6512956619262695, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.016660025343298912, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.462059497833252, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02061491645872593, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.562229633331299, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09282883256673813, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.5279154777526855, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.1065104678273201, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.804506301879883, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0023302286863327026, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.614202499389648, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0876508355140686, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.143386840820312, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06962878257036209, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.9075870513916, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002087592612951994, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.06837272644043, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.010758820921182632, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.752614974975586, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01909923367202282, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.839268207550049, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.0634303018450737, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.96802282333374, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04877461493015289, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.114015579223633, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.001505451393313706, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.351001739501953, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04814453050494194, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.465055465698242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03787975013256073, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.1490478515625, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005910811945796013, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.3682098388672, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.027577748522162437} +{"step": 1824522240, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59149169921875, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06705673784017563, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.05409812927246, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00220161909237504, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.651870250701904, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.015826839953660965, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.462704658508301, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.018986063078045845, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.5602593421936035, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08652821183204651, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.526034355163574, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10112912207841873, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.804545402526855, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0021256967447698116, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.61398696899414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08507448434829712, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.143171310424805, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.0677327886223793, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.91075325012207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.00235578091815114, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.0757474899292, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.009145873598754406, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.75783920288086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0171738900244236, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.840034008026123, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06661565601825714, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.968958854675293, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.05122470483183861, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.114381790161133, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0017236117273569107, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.35130500793457, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.052155449986457825, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.465478897094727, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.040287721902132034, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.156064987182617, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006544350180774927, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.4071044921875, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02853923663496971} +{"step": 1845493760, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.59016418457031, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06881725788116455, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.053050994873047, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002035262994468212, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.652202129364014, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01701253466308117, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.463047504425049, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02014586143195629, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.558579921722412, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08464407175779343, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.524409770965576, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09979041665792465, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.804444313049316, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0019303364679217339, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.613592147827148, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08291861414909363, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.142843246459961, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06526073068380356, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.913740158081055, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0021115359850227833, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.082672119140625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008931376039981842, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.762825012207031, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.015284308232367039, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.84077787399292, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06149563938379288, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.969840049743652, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.047171223908662796, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.11473274230957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0013248755130916834, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.351616859436035, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04553382098674774, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.465912818908691, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.034041959792375565, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.162099838256836, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0056030903942883015, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.44036865234375, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02663005329668522} +{"step": 1866465280, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58892822265625, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06834249198436737, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.05218505859375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002308154944330454, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.652685642242432, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.018213383853435516, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.463512420654297, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.021377665922045708, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.557131767272949, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08318626135587692, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.523017406463623, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09826971590518951, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.804410934448242, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0020775655284523964, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.613351821899414, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08498168736696243, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.1426362991333, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06480573862791061, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.916309356689453, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002107917331159115, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.088679313659668, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.00879908911883831, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.767138481140137, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.01537745725363493, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.841395378112793, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05579721927642822, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.970555305480957, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.0470363087952137, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.11505699157715, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.001548465806990862, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.351895332336426, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04741983488202095, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.466333389282227, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03678864240646362, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.167251586914062, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005153320264071226, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.4685516357422, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.025936828926205635} +{"step": 1887436800, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58776092529297, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.07648882269859314, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.051605224609375, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.00272831111215055, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.653426647186279, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.017171518877148628, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.46422004699707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.02052782103419304, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.555968761444092, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.09120749682188034, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.521909236907959, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.10948050022125244, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.80420970916748, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002135969465598464, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.612955093383789, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.08951929211616516, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.142335891723633, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06896931678056717, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.91848373413086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002432816429063678, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.09359359741211, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.012749407440423965, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.77066421508789, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.023984182626008987, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.841935157775879, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.06672021001577377, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.971160411834717, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04834689572453499, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.115398406982422, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0014450987800955772, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.352177619934082, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04709923267364502, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.466703414916992, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03627323731780052, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.171566009521484, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005797601770609617, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.4918975830078, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02743183635175228} +{"step": 1908408320, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58670806884766, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0702674463391304, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.05107307434082, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0025623200926929712, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.65376615524292, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.016738275066018105, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.464536190032959, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.019840070977807045, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.554971694946289, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.08880723267793655, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.520930290222168, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09862861782312393, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.80412769317627, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002024028915911913, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.612693786621094, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0806678757071495, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.142105102539062, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06374058872461319, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.920358657836914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0021216708701103926, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.097932815551758, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008361449465155602, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.773837089538574, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.013837913051247597, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.842374801635742, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.058941248804330826, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.971677303314209, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04702870920300484, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.11559295654297, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.001651788828894496, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.352278709411621, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04574809968471527, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.46690845489502, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.0363476388156414, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.17512321472168, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005391886457800865, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.5112762451172, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.026426425203680992} +{"step": 1929379840, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58584594726562, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06877399235963821, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.050546646118164, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002442852593958378, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.653867721557617, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01849004253745079, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.464593410491943, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.020975850522518158, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.554152488708496, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07946328073740005, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.5201239585876465, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09325995296239853, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.804004669189453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.002002761233597994, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.612405776977539, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07848771661520004, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.141898155212402, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.06138831004500389, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.92191505432129, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.002363316947594285, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.101387023925781, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008738306351006031, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.776355743408203, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.014557167887687683, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.842805862426758, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05480518564581871, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.972168922424316, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04581683501601219, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.11574935913086, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.001549404812976718, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.352367401123047, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04604656621813774, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.46707820892334, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.037525348365306854, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.1779727935791, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005703507456928492, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.5267791748047, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.026852020993828773} +{"step": 1950351360, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58511352539062, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06428030878305435, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.05013656616211, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0021557544823735952, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.654019355773926, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01733504794538021, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.4647369384765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.018953030928969383, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.553502559661865, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07569063454866409, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.5194993019104, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08922840654850006, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.803971290588379, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0018673149170354009, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.612250328063965, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07404480129480362, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.1417875289917, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.058588527143001556, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.92308235168457, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0019557883497327566, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.104015350341797, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.00825242605060339, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.778233528137207, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.013446206226944923, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.843056678771973, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05204443633556366, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.972471237182617, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04222262278199196, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.115938186645508, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.001182898529805243, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.352487564086914, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.041749224066734314, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.467252731323242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.031516045331954956, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.18020248413086, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005318919196724892, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.53878784179688, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.0258607380092144} +{"step": 1971322880, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58450317382812, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06203605979681015, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.049800872802734, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0020047840662300587, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.654036521911621, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.01504841260612011, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.464759826660156, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.01701122149825096, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.5529890060424805, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07584802806377411, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.519007682800293, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09050003439188004, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.803975105285645, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0017324044601991773, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.612164497375488, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.07660967111587524, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.141711235046387, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.05883535370230675, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.923980712890625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0018739021616056561, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.105996131896973, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008332628756761551, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.779678344726562, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.014505809172987938, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.843296051025391, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.057304684072732925, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.97273588180542, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.045966461300849915, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.11602020263672, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0015162356430664659, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.35251522064209, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04466233402490616, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.467347145080566, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03465888649225235, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.18187713623047, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.006025814916938543, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.5478057861328, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02665426954627037} +{"step": 1992294400, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58405303955078, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0638095885515213, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.04957389831543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.002223935443907976, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.654131889343262, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.016005709767341614, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.464844703674316, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.018771179020404816, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.5526041984558105, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07712769508361816, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.518633842468262, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.08898529410362244, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.80396842956543, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0020545462612062693, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.612095832824707, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0744268149137497, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.141640663146973, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.05876542255282402, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.92462730407715, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.001940262853167951, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.107378959655762, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008333471603691578, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.7806978225708, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.013904232531785965, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.843446254730225, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.050806473940610886, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.972896575927734, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04237227514386177, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.116125106811523, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0012268649879842997, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.352591514587402, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.041933171451091766, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.467451095581055, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03228326514363289, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.183090209960938, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005144593305885792, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.5543212890625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02523198351264} +{"step": 2013265920, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.583740234375, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.06451544165611267, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.04938316345215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0018799642566591501, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.654186725616455, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.016048336401581764, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.464893341064453, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.018732529133558273, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.552312850952148, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.0755586251616478, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.518359184265137, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0880025178194046, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.80398941040039, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0018834559014067054, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.612067222595215, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0729406401515007, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.141621589660645, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.05687751993536949, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.925079345703125, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0021273379679769278, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.108317375183105, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008793122135102749, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.781393051147461, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.014249571599066257, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.843568801879883, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.05082447826862335, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.973034381866455, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04254208132624626, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.116161346435547, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.001311763422563672, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.352602005004883, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04157622158527374, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.467490196228027, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03288394585251808, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.183900833129883, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.00640131626278162, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.5586700439453, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.026961222290992737} +{"step": 2034237440, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58354949951172, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.0664592981338501, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.04928207397461, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0022061611525714397, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.654238700866699, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.018315162509679794, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.464941024780273, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.020605750381946564, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.5521321296691895, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.07608338445425034, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.5181884765625, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.09124623239040375, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.803998947143555, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0018727615242823958, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.61205005645752, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.0741073340177536, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.14160442352295, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.058394353836774826, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.92535400390625, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0021746207494288683, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.108904838562012, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.008938144892454147, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.781826972961426, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.014494165778160095, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.843640327453613, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.049914248287677765, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.973113536834717, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04351605847477913, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.116201400756836, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0013172830222174525, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.352622032165527, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.04255706071853638, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.467521667480469, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.03225645422935486, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.18440055847168, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.0059601496905088425, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.5613555908203, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.02595406025648117} +{"step": 2055208960, "pnorm/_forward_module._fsdp_wrapped_module.emb.weight": 100.58345794677734, "gnorm/_forward_module._fsdp_wrapped_module.emb.weight": 0.060306135565042496, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 16.049232482910156, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight": 0.0020705845672637224, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 7.654272556304932, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight": 0.015002491883933544, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 7.464975357055664, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight": 0.01683901622891426, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 5.5520405769348145, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight": 0.0670790895819664, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 5.518103122711182, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight": 0.0815739631652832, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 15.804009437561035, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight": 0.0018467297777533531, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 14.61204719543457, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight": 0.06852976977825165, "pnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 10.141596794128418, "gnorm/_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight": 0.05420786887407303, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 16.925504684448242, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight": 0.0019031126284971833, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 10.109222412109375, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight": 0.007973442785441875, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 8.782061576843262, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight": 0.0133456876501441, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 5.843677043914795, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight": 0.04845327511429787, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 5.973154544830322, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight": 0.04019509255886078, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 16.1162166595459, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight": 0.0011712023988366127, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 15.352622032165527, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight": 0.03975445777177811, "pnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 10.467531204223633, "gnorm/_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight": 0.031036842614412308, "pnorm/_forward_module._fsdp_wrapped_module.norm.weight": 22.184659957885742, "gnorm/_forward_module._fsdp_wrapped_module.norm.weight": 0.005932583939284086, "pnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 199.562744140625, "gnorm/_forward_module._fsdp_wrapped_module.lm_head.weight": 0.025545069947838783} diff --git a/metrics/jsonlines/throughput.jsonl b/metrics/jsonlines/throughput.jsonl index f2da0cfc8c478763ac059f4ea2757907caf1f18e..d1338068fb9bfd4aab3d9b8754cc46866adb8ad1 100644 --- a/metrics/jsonlines/throughput.jsonl +++ b/metrics/jsonlines/throughput.jsonl @@ -1,98 +1,98 @@ -{"step": 20971520, "throughput/token_count": 20971520, "throughput/batch_count": 10, "throughput/flop_count": 0, "throughput/total_time": 59.67119211301906, "throughput/update_time": 59.47943267202936, "throughput/token_count_per_second_total_recent": 374916.30932995316, "throughput/token_count_per_second_total_cum": 351451.3328354376, "throughput/token_count_per_second_update_recent": 375934.54917902773, "throughput/token_count_per_second_update_cum": 352584.3986380524, "throughput/batch_count_per_second_total_recent": 0.17877402750489862, "throughput/batch_count_per_second_total_cum": 0.1675850547959507, "throughput/batch_count_per_second_update_recent": 0.17925956210090052, "throughput/batch_count_per_second_update_cum": 0.16812534267332668, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 41943040, "throughput/token_count": 41943040, "throughput/batch_count": 20, "throughput/flop_count": 0, "throughput/total_time": 115.56162341398885, "throughput/update_time": 115.25218190002488, "throughput/token_count_per_second_total_recent": 375079.0244805578, "throughput/token_count_per_second_total_cum": 362949.5567896526, "throughput/token_count_per_second_update_recent": 375978.1111156034, "throughput/token_count_per_second_update_cum": 363924.0429858703, "throughput/batch_count_per_second_total_recent": 0.17885161613490955, "throughput/batch_count_per_second_total_cum": 0.1730678352306617, "throughput/batch_count_per_second_update_recent": 0.1792803340509431, "throughput/batch_count_per_second_update_cum": 0.1735325064591743, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 62914560, "throughput/token_count": 62914560, "throughput/batch_count": 30, "throughput/flop_count": 0, "throughput/total_time": 261.80084426596295, "throughput/update_time": 171.00324709707638, "throughput/token_count_per_second_total_recent": 240887.2232219027, "throughput/token_count_per_second_total_cum": 240314.58025431432, "throughput/token_count_per_second_update_recent": 376042.04355063406, "throughput/token_count_per_second_update_cum": 367914.41722907283, "throughput/batch_count_per_second_total_recent": 0.11486397896857391, "throughput/batch_count_per_second_total_cum": 0.11459092152324406, "throughput/batch_count_per_second_update_recent": 0.1793108194115801, "throughput/batch_count_per_second_update_cum": 0.17543526517346994, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 83886080, "throughput/token_count": 83886080, "throughput/batch_count": 40, "throughput/flop_count": 0, "throughput/total_time": 317.6585605319706, "throughput/update_time": 226.74485654599266, "throughput/token_count_per_second_total_recent": 265264.0392877065, "throughput/token_count_per_second_total_cum": 264076.2454489475, "throughput/token_count_per_second_update_recent": 376089.5497784418, "throughput/token_count_per_second_update_cum": 369958.0280577815, "throughput/batch_count_per_second_total_recent": 0.12648775066743206, "throughput/batch_count_per_second_total_cum": 0.12592136642882704, "throughput/batch_count_per_second_update_recent": 0.1793334721462449, "throughput/batch_count_per_second_update_cum": 0.17640973475350452, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 104857600, "throughput/token_count": 104857600, "throughput/batch_count": 50, "throughput/flop_count": 0, "throughput/total_time": 464.0044742010068, "throughput/update_time": 282.4727703850367, "throughput/token_count_per_second_total_recent": 226007.99388077136, "throughput/token_count_per_second_total_cum": 225984.02780611048, "throughput/token_count_per_second_update_recent": 376136.5262811839, "throughput/token_count_per_second_update_cum": 371213.1256300185, "throughput/batch_count_per_second_total_recent": 0.10776900953329628, "throughput/batch_count_per_second_total_cum": 0.1077575816183617, "throughput/batch_count_per_second_update_recent": 0.17935587228831476, "throughput/batch_count_per_second_update_cum": 0.17700821191311764, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 125829120, "throughput/token_count": 125829120, "throughput/batch_count": 60, "throughput/flop_count": 0, "throughput/total_time": 519.8498190940008, "throughput/update_time": 338.1969051870401, "throughput/token_count_per_second_total_recent": 242363.86814922682, "throughput/token_count_per_second_total_cum": 242048.9829529924, "throughput/token_count_per_second_update_recent": 376171.9068115677, "throughput/token_count_per_second_update_cum": 372058.7565117135, "throughput/batch_count_per_second_total_recent": 0.1155680981393942, "throughput/batch_count_per_second_total_cum": 0.11541794917726154, "throughput/batch_count_per_second_update_recent": 0.1793727430398787, "throughput/batch_count_per_second_update_cum": 0.1774114401396339, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 146800640, "throughput/token_count": 146800640, "throughput/batch_count": 70, "throughput/flop_count": 0, "throughput/total_time": 665.9225179139758, "throughput/update_time": 393.9277793511865, "throughput/token_count_per_second_total_recent": 220384.95923191067, "throughput/token_count_per_second_total_cum": 220447.0280714607, "throughput/token_count_per_second_update_recent": 376190.44498147035, "throughput/token_count_per_second_update_cum": 372658.76562903496, "throughput/batch_count_per_second_total_recent": 0.10508773767085584, "throughput/batch_count_per_second_total_cum": 0.10511733439991984, "throughput/batch_count_per_second_update_recent": 0.17938158272813337, "throughput/batch_count_per_second_update_cum": 0.17769754678203342, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 167772160, "throughput/token_count": 167772160, "throughput/batch_count": 80, "throughput/flop_count": 0, "throughput/total_time": 721.7491259319941, "throughput/update_time": 449.6537483881111, "throughput/token_count_per_second_total_recent": 232552.1714467396, "throughput/token_count_per_second_total_cum": 232452.18313684265, "throughput/token_count_per_second_update_recent": 376208.4814403935, "throughput/token_count_per_second_update_cum": 373114.1141409773, "throughput/batch_count_per_second_total_recent": 0.11088951656662922, "throughput/batch_count_per_second_total_cum": 0.11084183842508442, "throughput/batch_count_per_second_update_recent": 0.17939018318195032, "throughput/batch_count_per_second_update_cum": 0.17791467387246004, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 188743680, "throughput/token_count": 188743680, "throughput/batch_count": 90, "throughput/flop_count": 0, "throughput/total_time": 867.7238940689713, "throughput/update_time": 505.3731428361498, "throughput/token_count_per_second_total_recent": 217436.49572581126, "throughput/token_count_per_second_total_cum": 217515.82650897666, "throughput/token_count_per_second_update_recent": 376227.4518565005, "throughput/token_count_per_second_update_cum": 373473.90274990094, "throughput/batch_count_per_second_total_recent": 0.10368180071154177, "throughput/batch_count_per_second_total_cum": 0.1037196285767444, "throughput/batch_count_per_second_update_recent": 0.17939922898125674, "throughput/batch_count_per_second_update_cum": 0.17808623445029304, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 209715200, "throughput/token_count": 209715200, "throughput/batch_count": 100, "throughput/flop_count": 0, "throughput/total_time": 923.5610087590176, "throughput/update_time": 561.0979154942906, "throughput/token_count_per_second_total_recent": 227095.408003922, "throughput/token_count_per_second_total_cum": 227072.3839692982, "throughput/token_count_per_second_update_recent": 376238.9242932967, "throughput/token_count_per_second_update_cum": 373758.65104623424, "throughput/batch_count_per_second_total_recent": 0.10828752899356937, "throughput/batch_count_per_second_total_cum": 0.10827655027832898, "throughput/batch_count_per_second_update_recent": 0.1794046994654163, "throughput/batch_count_per_second_update_cum": 0.1782220130187198, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 230686720, "throughput/token_count": 230686720, "throughput/batch_count": 110, "throughput/flop_count": 0, "throughput/total_time": 1070.0474769069697, "throughput/update_time": 616.8178668163018, "throughput/token_count_per_second_total_recent": 206631.42889241298, "throughput/token_count_per_second_total_cum": 215585.4996890535, "throughput/token_count_per_second_update_recent": 376283.85767954245, "throughput/token_count_per_second_update_cum": 373994.87338245695, "throughput/batch_count_per_second_total_recent": 0.0985295433485093, "throughput/batch_count_per_second_total_cum": 0.10279917702152896, "throughput/batch_count_per_second_update_recent": 0.17942612537362215, "throughput/batch_count_per_second_update_cum": 0.17833465260622833, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 251658240, "throughput/token_count": 251658240, "throughput/batch_count": 120, "throughput/flop_count": 0, "throughput/total_time": 1125.8911167309852, "throughput/update_time": 672.5490376223461, "throughput/token_count_per_second_total_recent": 227063.40239931937, "throughput/token_count_per_second_total_cum": 223519.16296372196, "throughput/token_count_per_second_update_recent": 376311.58311418927, "throughput/token_count_per_second_update_cum": 374185.7112600802, "throughput/batch_count_per_second_total_recent": 0.10827226753202408, "throughput/batch_count_per_second_total_cum": 0.10658224247156237, "throughput/batch_count_per_second_update_recent": 0.1794393458910891, "throughput/batch_count_per_second_update_cum": 0.17842565119747172, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 272629760, "throughput/token_count": 272629760, "throughput/batch_count": 130, "throughput/flop_count": 0, "throughput/total_time": 1271.9843903059955, "throughput/update_time": 728.276082833414, "throughput/token_count_per_second_total_recent": 206669.85450724023, "throughput/token_count_per_second_total_cum": 214334.20258751343, "throughput/token_count_per_second_update_recent": 376325.62557704636, "throughput/token_count_per_second_update_cum": 374349.4622798994, "throughput/batch_count_per_second_total_recent": 0.09854786610948574, "throughput/batch_count_per_second_total_cum": 0.10220251206756278, "throughput/batch_count_per_second_update_recent": 0.17944604185917204, "throughput/batch_count_per_second_update_cum": 0.17850373376841516, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 293601280, "throughput/token_count": 293601280, "throughput/batch_count": 140, "throughput/flop_count": 0, "throughput/total_time": 1327.8194525539875, "throughput/update_time": 784.0098267712165, "throughput/token_count_per_second_total_recent": 227133.07164059338, "throughput/token_count_per_second_total_cum": 221115.36281176942, "throughput/token_count_per_second_update_recent": 376329.64667115675, "throughput/token_count_per_second_update_cum": 374486.7346996614, "throughput/batch_count_per_second_total_recent": 0.10830548841504735, "throughput/batch_count_per_second_total_cum": 0.10543602123821708, "throughput/batch_count_per_second_update_recent": 0.1794479592662605, "throughput/batch_count_per_second_update_cum": 0.17856919035895413, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 314572800, "throughput/token_count": 314572800, "throughput/batch_count": 150, "throughput/flop_count": 0, "throughput/total_time": 1473.913084711996, "throughput/update_time": 839.7419353383593, "throughput/token_count_per_second_total_recent": 206726.245976975, "throughput/token_count_per_second_total_cum": 213426.96748056065, "throughput/token_count_per_second_update_recent": 376326.36273890175, "throughput/token_count_per_second_update_cum": 374606.515123302, "throughput/batch_count_per_second_total_recent": 0.09857475565766097, "throughput/batch_count_per_second_total_cum": 0.10176990865734131, "throughput/batch_count_per_second_update_recent": 0.1794463933653363, "throughput/batch_count_per_second_update_cum": 0.1786263061157713, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 335544320, "throughput/token_count": 335544320, "throughput/batch_count": 160, "throughput/flop_count": 0, "throughput/total_time": 1529.7566971820197, "throughput/update_time": 895.4798201125232, "throughput/token_count_per_second_total_recent": 227130.10228160047, "throughput/token_count_per_second_total_cum": 219344.89361485364, "throughput/token_count_per_second_update_recent": 376319.29891578166, "throughput/token_count_per_second_update_cum": 374708.96882727806, "throughput/batch_count_per_second_total_recent": 0.10830407251434349, "throughput/batch_count_per_second_total_cum": 0.10459179573767359, "throughput/batch_count_per_second_update_recent": 0.17944302507199367, "throughput/batch_count_per_second_update_cum": 0.17867515984882262, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 356515840, "throughput/token_count": 356515840, "throughput/batch_count": 170, "throughput/flop_count": 0, "throughput/total_time": 1675.8973924720194, "throughput/update_time": 951.2102684524143, "throughput/token_count_per_second_total_recent": 206711.7491508341, "throughput/token_count_per_second_total_cum": 212731.305389839, "throughput/token_count_per_second_update_recent": 376316.18827154406, "throughput/token_count_per_second_update_cum": 374802.34583678195, "throughput/batch_count_per_second_total_recent": 0.09856784303228097, "throughput/batch_count_per_second_total_cum": 0.10143819112293195, "throughput/batch_count_per_second_update_recent": 0.17944154180123523, "throughput/batch_count_per_second_update_cum": 0.17871968547667597, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 377487360, "throughput/token_count": 377487360, "throughput/batch_count": 180, "throughput/flop_count": 0, "throughput/total_time": 1731.7409806579817, "throughput/update_time": 1006.941321704362, "throughput/token_count_per_second_total_recent": 227089.28041586204, "throughput/token_count_per_second_total_cum": 217981.4211341076, "throughput/token_count_per_second_update_recent": 376315.0390515152, "throughput/token_count_per_second_update_cum": 374885.16149189306, "throughput/batch_count_per_second_total_recent": 0.10828460713189222, "throughput/batch_count_per_second_total_cum": 0.10394164139466648, "throughput/batch_count_per_second_update_recent": 0.17944099381042253, "throughput/batch_count_per_second_update_cum": 0.17875917505831387, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 398458880, "throughput/token_count": 398458880, "throughput/batch_count": 190, "throughput/flop_count": 0, "throughput/total_time": 1877.79561357497, "throughput/update_time": 1062.6691502483445, "throughput/token_count_per_second_total_recent": 206691.61850006302, "throughput/token_count_per_second_total_cum": 212195.02118306112, "throughput/token_count_per_second_update_recent": 376308.86018347496, "throughput/token_count_per_second_update_cum": 374960.42856507184, "throughput/batch_count_per_second_total_recent": 0.09855824398997451, "throughput/batch_count_per_second_total_cum": 0.10118247088578278, "throughput/batch_count_per_second_update_recent": 0.1794380474965453, "throughput/batch_count_per_second_update_cum": 0.1787950651955947, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 419430400, "throughput/token_count": 419430400, "throughput/batch_count": 200, "throughput/flop_count": 0, "throughput/total_time": 1933.6392927800189, "throughput/update_time": 1118.3960400532233, "throughput/token_count_per_second_total_recent": 227193.6836406189, "throughput/token_count_per_second_total_cum": 216912.4311685761, "throughput/token_count_per_second_update_recent": 376307.502138501, "throughput/token_count_per_second_update_cum": 375028.5095609242, "throughput/batch_count_per_second_total_recent": 0.10833439046889252, "throughput/batch_count_per_second_total_cum": 0.10343190725735478, "throughput/batch_count_per_second_update_recent": 0.1794373999302392, "throughput/batch_count_per_second_update_cum": 0.1788275287441846, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 440401920, "throughput/token_count": 440401920, "throughput/batch_count": 210, "throughput/flop_count": 0, "throughput/total_time": 2080.2760781620163, "throughput/update_time": 1174.1330189242726, "throughput/token_count_per_second_total_recent": 206660.15027500596, "throughput/token_count_per_second_total_cum": 211703.59291402693, "throughput/token_count_per_second_update_recent": 376295.86338184687, "throughput/token_count_per_second_update_cum": 375086.9048921657, "throughput/batch_count_per_second_total_recent": 0.0985432387709646, "throughput/batch_count_per_second_total_cum": 0.10094813962651583, "throughput/batch_count_per_second_update_recent": 0.17943185013859123, "throughput/batch_count_per_second_update_cum": 0.1788553738079861, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 461373440, "throughput/token_count": 461373440, "throughput/batch_count": 220, "throughput/flop_count": 0, "throughput/total_time": 2136.1286090469803, "throughput/update_time": 1229.8719967252691, "throughput/token_count_per_second_total_recent": 227055.88273089295, "throughput/token_count_per_second_total_cum": 215985.79694405137, "throughput/token_count_per_second_update_recent": 376290.52742921386, "throughput/token_count_per_second_update_cum": 375139.3976190047, "throughput/batch_count_per_second_total_recent": 0.108268681874701, "throughput/batch_count_per_second_total_cum": 0.10299005362703866, "throughput/batch_count_per_second_update_recent": 0.1794293057581014, "throughput/batch_count_per_second_update_cum": 0.17888040429067836, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 482344960, "throughput/token_count": 482344960, "throughput/batch_count": 230, "throughput/flop_count": 0, "throughput/total_time": 2282.3240791389835, "throughput/update_time": 1285.6013174692634, "throughput/token_count_per_second_total_recent": 206635.936220446, "throughput/token_count_per_second_total_cum": 211339.3818208178, "throughput/token_count_per_second_update_recent": 376288.1136370948, "throughput/token_count_per_second_update_cum": 375190.1568905572, "throughput/batch_count_per_second_total_recent": 0.09853169260999965, "throughput/batch_count_per_second_total_cum": 0.10077447024384394, "throughput/batch_count_per_second_update_recent": 0.17942815477232685, "throughput/batch_count_per_second_update_cum": 0.17890460819747792, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 503316480, "throughput/token_count": 503316480, "throughput/batch_count": 240, "throughput/flop_count": 0, "throughput/total_time": 2338.156839519972, "throughput/update_time": 1341.322897736216, "throughput/token_count_per_second_total_recent": 227031.08804347448, "throughput/token_count_per_second_total_cum": 215262.06946123077, "throughput/token_count_per_second_update_recent": 376298.366257677, "throughput/token_count_per_second_update_cum": 375238.8636990092, "throughput/batch_count_per_second_total_recent": 0.10825685884641384, "throughput/batch_count_per_second_total_cum": 0.10264495347081698, "throughput/batch_count_per_second_update_recent": 0.1794330436027894, "throughput/batch_count_per_second_update_cum": 0.1789278334136053, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 524288000, "throughput/token_count": 524288000, "throughput/batch_count": 250, "throughput/flop_count": 0, "throughput/total_time": 2484.2231900609913, "throughput/update_time": 1397.0756743992679, "throughput/token_count_per_second_total_recent": 206643.3741049392, "throughput/token_count_per_second_total_cum": 211047.05973988108, "throughput/token_count_per_second_update_recent": 376284.8654450764, "throughput/token_count_per_second_update_cum": 375275.3051300818, "throughput/batch_count_per_second_total_recent": 0.09853523926970444, "throughput/batch_count_per_second_total_cum": 0.10063508021349005, "throughput/batch_count_per_second_update_recent": 0.17942660591367549, "throughput/batch_count_per_second_update_cum": 0.1789452100420388, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 545259520, "throughput/token_count": 545259520, "throughput/batch_count": 260, "throughput/flop_count": 0, "throughput/total_time": 2540.0900395850185, "throughput/update_time": 1452.8315300212707, "throughput/token_count_per_second_total_recent": 227042.01399907397, "throughput/token_count_per_second_total_cum": 214661.49290088966, "throughput/token_count_per_second_update_recent": 376270.85829193465, "throughput/token_count_per_second_update_cum": 375308.15427169105, "throughput/batch_count_per_second_total_recent": 0.10826206874803256, "throughput/batch_count_per_second_total_cum": 0.10235857625050052, "throughput/batch_count_per_second_update_recent": 0.17941992678257687, "throughput/batch_count_per_second_update_cum": 0.1789608737333732, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 566231040, "throughput/token_count": 566231040, "throughput/batch_count": 270, "throughput/flop_count": 0, "throughput/total_time": 2686.6876707019983, "throughput/update_time": 1508.5666796893347, "throughput/token_count_per_second_total_recent": 206545.4422012343, "throughput/token_count_per_second_total_cum": 210754.32257149965, "throughput/token_count_per_second_update_recent": 376267.5017963584, "throughput/token_count_per_second_update_cum": 375343.72701152746, "throughput/batch_count_per_second_total_recent": 0.09848854169904438, "throughput/batch_count_per_second_total_cum": 0.10049549225401862, "throughput/batch_count_per_second_update_recent": 0.17941832628076476, "throughput/batch_count_per_second_update_cum": 0.17897783613754628, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 587202560, "throughput/token_count": 587202560, "throughput/batch_count": 280, "throughput/flop_count": 0, "throughput/total_time": 2742.5622889249935, "throughput/update_time": 1564.3145829213317, "throughput/token_count_per_second_total_recent": 226902.08771317446, "throughput/token_count_per_second_total_cum": 214107.282948956, "throughput/token_count_per_second_update_recent": 376256.63965991966, "throughput/token_count_per_second_update_cum": 375373.7045034822, "throughput/batch_count_per_second_total_recent": 0.10819534669550632, "throughput/batch_count_per_second_total_cum": 0.10209430835197258, "throughput/batch_count_per_second_update_recent": 0.1794131468104933, "throughput/batch_count_per_second_update_cum": 0.17899213051962004, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 608174080, "throughput/token_count": 608174080, "throughput/batch_count": 290, "throughput/flop_count": 0, "throughput/total_time": 2889.1227126879967, "throughput/update_time": 1620.060068657389, "throughput/token_count_per_second_total_recent": 206434.77099299317, "throughput/token_count_per_second_total_cum": 210504.75887684393, "throughput/token_count_per_second_update_recent": 376245.0139528033, "throughput/token_count_per_second_update_cum": 375402.1790710632, "throughput/batch_count_per_second_total_recent": 0.09843576955461177, "throughput/batch_count_per_second_total_cum": 0.10037649101106831, "throughput/batch_count_per_second_update_recent": 0.17940760324134983, "throughput/batch_count_per_second_update_cum": 0.1790057082515064, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 629145600, "throughput/token_count": 629145600, "throughput/batch_count": 300, "throughput/flop_count": 0, "throughput/total_time": 2944.979424642981, "throughput/update_time": 1675.8085315313656, "throughput/token_count_per_second_total_recent": 226911.78067480956, "throughput/token_count_per_second_total_cum": 213633.2752396975, "throughput/token_count_per_second_update_recent": 376230.09559799184, "throughput/token_count_per_second_update_cum": 375428.0922684421, "throughput/batch_count_per_second_total_recent": 0.10819996865978697, "throughput/batch_count_per_second_total_cum": 0.10186828386292339, "throughput/batch_count_per_second_update_recent": 0.17940048961543648, "throughput/batch_count_per_second_update_cum": 0.17901806462690453, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 650117120, "throughput/token_count": 650117120, "throughput/batch_count": 310, "throughput/flop_count": 0, "throughput/total_time": 3091.6305087410146, "throughput/update_time": 1731.5482600294054, "throughput/token_count_per_second_total_recent": 206428.5047405374, "throughput/token_count_per_second_total_cum": 210282.92939984705, "throughput/token_count_per_second_update_recent": 376228.5123221606, "throughput/token_count_per_second_update_cum": 375454.2307639521, "throughput/batch_count_per_second_total_recent": 0.09843278157259817, "throughput/batch_count_per_second_total_cum": 0.10027071447365143, "throughput/batch_count_per_second_update_recent": 0.17939973465068845, "throughput/batch_count_per_second_update_cum": 0.17903052843282322, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 671088640, "throughput/token_count": 671088640, "throughput/batch_count": 320, "throughput/flop_count": 0, "throughput/total_time": 3147.4849796229973, "throughput/update_time": 1787.300771905575, "throughput/token_count_per_second_total_recent": 226802.20192292548, "throughput/token_count_per_second_total_cum": 213214.24703999137, "throughput/token_count_per_second_update_recent": 376219.8566879159, "throughput/token_count_per_second_update_cum": 375476.05335866456, "throughput/batch_count_per_second_total_recent": 0.10814771743913912, "throughput/batch_count_per_second_total_cum": 0.10166847564696854, "throughput/batch_count_per_second_update_recent": 0.17939560732265278, "throughput/batch_count_per_second_update_cum": 0.17904093425687054, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 692060160, "throughput/token_count": 692060160, "throughput/batch_count": 330, "throughput/flop_count": 0, "throughput/total_time": 3293.557014766964, "throughput/update_time": 1843.0341175765498, "throughput/token_count_per_second_total_recent": 206452.85739865416, "throughput/token_count_per_second_total_cum": 210125.45308828267, "throughput/token_count_per_second_update_recent": 376215.0221140973, "throughput/token_count_per_second_update_cum": 375500.46057205205, "throughput/batch_count_per_second_total_recent": 0.09844439382488926, "throughput/batch_count_per_second_total_cum": 0.10019562391676076, "throughput/batch_count_per_second_update_recent": 0.179393302018212, "throughput/batch_count_per_second_update_cum": 0.17905257252314188, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 713031680, "throughput/token_count": 713031680, "throughput/batch_count": 340, "throughput/flop_count": 0, "throughput/total_time": 3349.424384585989, "throughput/update_time": 1898.7890577405924, "throughput/token_count_per_second_total_recent": 226790.67716648633, "throughput/token_count_per_second_total_cum": 212881.8561426146, "throughput/token_count_per_second_update_recent": 376193.9149897223, "throughput/token_count_per_second_update_cum": 375519.16422377684, "throughput/batch_count_per_second_total_recent": 0.10814222200702969, "throughput/batch_count_per_second_total_cum": 0.10150997931605081, "throughput/batch_count_per_second_update_recent": 0.17938323735700717, "throughput/batch_count_per_second_update_cum": 0.17906149111927835, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 734003200, "throughput/token_count": 734003200, "throughput/batch_count": 350, "throughput/flop_count": 0, "throughput/total_time": 3496.0525263110176, "throughput/update_time": 1954.5468697096221, "throughput/token_count_per_second_total_recent": 206333.10514380576, "throughput/token_count_per_second_total_cum": 209951.9942781035, "throughput/token_count_per_second_update_recent": 376191.11088553874, "throughput/token_count_per_second_update_cum": 375536.24902791274, "throughput/batch_count_per_second_total_recent": 0.09838729149999893, "throughput/batch_count_per_second_total_cum": 0.10011291231064963, "throughput/batch_count_per_second_update_recent": 0.17938190025593698, "throughput/batch_count_per_second_update_cum": 0.17906963778873097, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 754974720, "throughput/token_count": 754974720, "throughput/batch_count": 360, "throughput/flop_count": 0, "throughput/total_time": 3551.9233703140053, "throughput/update_time": 2010.2989615525585, "throughput/token_count_per_second_total_recent": 226783.12609209985, "throughput/token_count_per_second_total_cum": 212553.77475479068, "throughput/token_count_per_second_update_recent": 376192.5950830513, "throughput/token_count_per_second_update_cum": 375553.45470453374, "throughput/batch_count_per_second_total_recent": 0.10813862137417786, "throughput/batch_count_per_second_total_cum": 0.10135353791942152, "throughput/batch_count_per_second_update_recent": 0.17938260797646108, "throughput/batch_count_per_second_update_cum": 0.1790778420946759, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 775946240, "throughput/token_count": 775946240, "throughput/batch_count": 370, "throughput/flop_count": 0, "throughput/total_time": 3698.38593041501, "throughput/update_time": 2066.043420936505, "throughput/token_count_per_second_total_recent": 206358.2613796437, "throughput/token_count_per_second_total_cum": 209806.7250415178, "throughput/token_count_per_second_update_recent": 376184.8285322082, "throughput/token_count_per_second_update_cum": 375571.1192402122, "throughput/batch_count_per_second_total_recent": 0.09839928692800698, "throughput/batch_count_per_second_total_cum": 0.10004364254070178, "throughput/batch_count_per_second_update_recent": 0.17937890459642802, "throughput/batch_count_per_second_update_cum": 0.1790862652016698, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 796917760, "throughput/token_count": 796917760, "throughput/batch_count": 380, "throughput/flop_count": 0, "throughput/total_time": 3754.2459046120057, "throughput/update_time": 2121.7969342375873, "throughput/token_count_per_second_total_recent": 226809.92110373423, "throughput/token_count_per_second_total_cum": 212271.06061992495, "throughput/token_count_per_second_update_recent": 376183.2312199069, "throughput/token_count_per_second_update_cum": 375586.25292591995, "throughput/batch_count_per_second_total_recent": 0.10815139823137962, "throughput/batch_count_per_second_total_cum": 0.10121872931476829, "throughput/batch_count_per_second_update_recent": 0.1793781429385695, "throughput/batch_count_per_second_update_cum": 0.1790934815053558, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 817889280, "throughput/token_count": 817889280, "throughput/batch_count": 390, "throughput/flop_count": 0, "throughput/total_time": 3902.067099667969, "throughput/update_time": 2177.5441309445887, "throughput/token_count_per_second_total_recent": 206102.64295274275, "throughput/token_count_per_second_total_cum": 209604.1044680126, "throughput/token_count_per_second_update_recent": 376181.9701238888, "throughput/token_count_per_second_update_cum": 375601.701190419, "throughput/batch_count_per_second_total_recent": 0.09827739856373918, "throughput/batch_count_per_second_total_cum": 0.09994702552223807, "throughput/batch_count_per_second_update_recent": 0.179377541601128, "throughput/batch_count_per_second_update_cum": 0.1791008478118987, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 838860800, "throughput/token_count": 838860800, "throughput/batch_count": 400, "throughput/flop_count": 0, "throughput/total_time": 3957.9235017200117, "throughput/update_time": 2233.291398033558, "throughput/token_count_per_second_total_recent": 226522.7246087554, "throughput/token_count_per_second_total_cum": 211944.67241103895, "throughput/token_count_per_second_update_recent": 376182.34367966984, "throughput/token_count_per_second_update_cum": 375616.3663813096, "throughput/batch_count_per_second_total_recent": 0.10801445227086802, "throughput/batch_count_per_second_total_cum": 0.10106309528877208, "throughput/batch_count_per_second_update_recent": 0.17937771972640507, "throughput/batch_count_per_second_update_cum": 0.17910784071984748, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 859832320, "throughput/token_count": 859832320, "throughput/batch_count": 410, "throughput/flop_count": 0, "throughput/total_time": 4104.85487911501, "throughput/update_time": 2289.03639949864, "throughput/token_count_per_second_total_recent": 206045.2895276673, "throughput/token_count_per_second_total_cum": 209467.1663972141, "throughput/token_count_per_second_update_recent": 376179.0003358533, "throughput/token_count_per_second_update_cum": 375630.6890481629, "throughput/batch_count_per_second_total_recent": 0.09825005031951299, "throughput/batch_count_per_second_total_cum": 0.0998817283617087, "throughput/batch_count_per_second_update_recent": 0.1793761254958407, "throughput/batch_count_per_second_update_cum": 0.1791146702996077, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 880803840, "throughput/token_count": 880803840, "throughput/batch_count": 420, "throughput/flop_count": 0, "throughput/total_time": 4160.708882857987, "throughput/update_time": 2344.7830602055765, "throughput/token_count_per_second_total_recent": 226310.30854562361, "throughput/token_count_per_second_total_cum": 211695.61841466223, "throughput/token_count_per_second_update_recent": 376181.6011732201, "throughput/token_count_per_second_update_cum": 375644.06488111377, "throughput/batch_count_per_second_total_recent": 0.10791316439896756, "throughput/batch_count_per_second_total_cum": 0.10094433708890067, "throughput/batch_count_per_second_update_recent": 0.17937736567173962, "throughput/batch_count_per_second_update_cum": 0.1791210483937806, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 901775360, "throughput/token_count": 901775360, "throughput/batch_count": 430, "throughput/flop_count": 0, "throughput/total_time": 4307.030373459973, "throughput/update_time": 2400.5389171724673, "throughput/token_count_per_second_total_recent": 205994.39025761114, "throughput/token_count_per_second_total_cum": 209372.88150015424, "throughput/token_count_per_second_update_recent": 376167.1402040627, "throughput/token_count_per_second_update_cum": 375655.38036024757, "throughput/batch_count_per_second_total_recent": 0.0982257796562248, "throughput/batch_count_per_second_total_cum": 0.09983676981933319, "throughput/batch_count_per_second_update_recent": 0.17937047014430174, "throughput/batch_count_per_second_update_cum": 0.17912644403469447, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 922746880, "throughput/token_count": 922746880, "throughput/batch_count": 440, "throughput/flop_count": 0, "throughput/total_time": 4362.870210377965, "throughput/update_time": 2456.286583611334, "throughput/token_count_per_second_total_recent": 226386.02383375837, "throughput/token_count_per_second_total_cum": 211499.9611505886, "throughput/token_count_per_second_update_recent": 376173.77317735896, "throughput/token_count_per_second_update_cum": 375667.4348004374, "throughput/batch_count_per_second_total_recent": 0.10794926826179427, "throughput/batch_count_per_second_total_cum": 0.10085104043511801, "throughput/batch_count_per_second_update_recent": 0.17937363299243878, "throughput/batch_count_per_second_update_cum": 0.17913219203969832, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 943718400, "throughput/token_count": 943718400, "throughput/batch_count": 450, "throughput/flop_count": 0, "throughput/total_time": 4509.209549701016, "throughput/update_time": 2512.043382478296, "throughput/token_count_per_second_total_recent": 206060.28561857666, "throughput/token_count_per_second_total_cum": 209286.88046058393, "throughput/token_count_per_second_update_recent": 376174.9465690517, "throughput/token_count_per_second_update_cum": 375677.5884455307, "throughput/batch_count_per_second_total_recent": 0.09825720101288636, "throughput/batch_count_per_second_total_cum": 0.09979576132802197, "throughput/batch_count_per_second_update_recent": 0.179374192509199, "throughput/batch_count_per_second_update_cum": 0.17913703367497, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 964689920, "throughput/token_count": 964689920, "throughput/batch_count": 460, "throughput/flop_count": 0, "throughput/total_time": 4565.060955744993, "throughput/update_time": 2567.792047406314, "throughput/token_count_per_second_total_recent": 226428.23360513488, "throughput/token_count_per_second_total_cum": 211320.27137249208, "throughput/token_count_per_second_update_recent": 376175.12999496376, "throughput/token_count_per_second_update_cum": 375688.4911978826, "throughput/batch_count_per_second_total_recent": 0.10796939544922585, "throughput/batch_count_per_second_total_cum": 0.10076535767197231, "throughput/batch_count_per_second_update_recent": 0.17937427997348965, "throughput/batch_count_per_second_update_cum": 0.1791422325124181, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 985661440, "throughput/token_count": 985661440, "throughput/batch_count": 470, "throughput/flop_count": 0, "throughput/total_time": 4711.463092761987, "throughput/update_time": 2623.54389296734, "throughput/token_count_per_second_total_recent": 206075.54762161156, "throughput/token_count_per_second_total_cum": 209204.9583311452, "throughput/token_count_per_second_update_recent": 376171.51957477204, "throughput/token_count_per_second_update_cum": 375698.4751206792, "throughput/batch_count_per_second_total_recent": 0.09826447850304201, "throughput/batch_count_per_second_total_cum": 0.09975669781262646, "throughput/batch_count_per_second_update_recent": 0.1793725583909855, "throughput/batch_count_per_second_update_cum": 0.17914699321779212, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1006632960, "throughput/token_count": 1006632960, "throughput/batch_count": 480, "throughput/flop_count": 0, "throughput/total_time": 4767.3171031199745, "throughput/update_time": 2679.296893617313, "throughput/token_count_per_second_total_recent": 226780.27699997157, "throughput/token_count_per_second_total_cum": 211152.9269452641, "throughput/token_count_per_second_update_recent": 376171.32178092335, "throughput/token_count_per_second_update_cum": 375707.8815707307, "throughput/batch_count_per_second_total_recent": 0.10813726282118395, "throughput/batch_count_per_second_total_cum": 0.1006855616308518, "throughput/batch_count_per_second_update_recent": 0.1793724640755288, "throughput/batch_count_per_second_update_cum": 0.17915147856270347, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1027604480, "throughput/token_count": 1027604480, "throughput/batch_count": 490, "throughput/flop_count": 0, "throughput/total_time": 4913.884267903981, "throughput/update_time": 2735.0481502541807, "throughput/token_count_per_second_total_recent": 206334.23086639945, "throughput/token_count_per_second_total_cum": 209122.64595078165, "throughput/token_count_per_second_update_recent": 376169.13636475493, "throughput/token_count_per_second_update_cum": 375717.14410384325, "throughput/batch_count_per_second_total_recent": 0.09838782828636143, "throughput/batch_count_per_second_total_cum": 0.09971744821108897, "throughput/batch_count_per_second_update_recent": 0.17937142198789355, "throughput/batch_count_per_second_update_cum": 0.17915589528267062, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1048576000, "throughput/token_count": 1048576000, "throughput/batch_count": 500, "throughput/flop_count": 0, "throughput/total_time": 4969.727964510967, "throughput/update_time": 2790.797476610227, "throughput/token_count_per_second_total_recent": 226873.45711823803, "throughput/token_count_per_second_total_cum": 210992.63530879852, "throughput/token_count_per_second_update_recent": 376167.0305867764, "throughput/token_count_per_second_update_cum": 375726.2964396925, "throughput/batch_count_per_second_total_recent": 0.10818169456397916, "throughput/batch_count_per_second_total_cum": 0.10060912862243582, "throughput/batch_count_per_second_update_recent": 0.17937041787470645, "throughput/batch_count_per_second_update_cum": 0.1791602594564879, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1069547520, "throughput/token_count": 1069547520, "throughput/batch_count": 510, "throughput/flop_count": 0, "throughput/total_time": 5116.911609567003, "throughput/update_time": 2846.5413487541373, "throughput/token_count_per_second_total_recent": 206285.24396647583, "throughput/token_count_per_second_total_cum": 209022.08238271793, "throughput/token_count_per_second_update_recent": 376168.11165575223, "throughput/token_count_per_second_update_cum": 375735.8102203979, "throughput/batch_count_per_second_total_recent": 0.09836446951221267, "throughput/batch_count_per_second_total_cum": 0.0996694957650747, "throughput/batch_count_per_second_update_recent": 0.17937093336856472, "throughput/batch_count_per_second_update_cum": 0.17916479598064322, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1090519040, "throughput/token_count": 1090519040, "throughput/batch_count": 520, "throughput/flop_count": 0, "throughput/total_time": 5172.765950003988, "throughput/update_time": 2902.285505968146, "throughput/token_count_per_second_total_recent": 226656.96121842013, "throughput/token_count_per_second_total_cum": 210819.32771366916, "throughput/token_count_per_second_update_recent": 376170.0576664534, "throughput/token_count_per_second_update_cum": 375744.92163417395, "throughput/batch_count_per_second_total_recent": 0.10807846127434736, "throughput/batch_count_per_second_total_cum": 0.10052648912127932, "throughput/batch_count_per_second_update_recent": 0.1793718612987773, "throughput/batch_count_per_second_update_cum": 0.1791691406412954, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1111490560, "throughput/token_count": 1111490560, "throughput/batch_count": 530, "throughput/flop_count": 0, "throughput/total_time": 5319.3806196419755, "throughput/update_time": 2958.0443075241055, "throughput/token_count_per_second_total_recent": 206224.37513269202, "throughput/token_count_per_second_total_cum": 208951.1241018902, "throughput/token_count_per_second_update_recent": 376167.6797363453, "throughput/token_count_per_second_update_cum": 375751.82940052776, "throughput/batch_count_per_second_total_recent": 0.09833544499048806, "throughput/batch_count_per_second_total_cum": 0.09963566022009382, "throughput/batch_count_per_second_update_recent": 0.17937072741334215, "throughput/batch_count_per_second_update_cum": 0.1791724345209731, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1132462080, "throughput/token_count": 1132462080, "throughput/batch_count": 540, "throughput/flop_count": 0, "throughput/total_time": 5375.2188087760005, "throughput/update_time": 3013.786068893096, "throughput/token_count_per_second_total_recent": 226590.0487287313, "throughput/token_count_per_second_total_cum": 210682.04296187055, "throughput/token_count_per_second_update_recent": 376171.85754434933, "throughput/token_count_per_second_update_cum": 375760.6061321834, "throughput/batch_count_per_second_total_recent": 0.10804655491291584, "throughput/batch_count_per_second_total_cum": 0.10046102665036705, "throughput/batch_count_per_second_update_recent": 0.1793727195474383, "throughput/batch_count_per_second_update_cum": 0.17917661959275408, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1153433600, "throughput/token_count": 1153433600, "throughput/batch_count": 550, "throughput/flop_count": 0, "throughput/total_time": 5521.543946499005, "throughput/update_time": 3069.52293490601, "throughput/token_count_per_second_total_recent": 206228.7986846977, "throughput/token_count_per_second_total_cum": 208896.9337518987, "throughput/token_count_per_second_update_recent": 376185.5953527084, "throughput/token_count_per_second_update_cum": 375769.6633842935, "throughput/batch_count_per_second_total_recent": 0.09833755430445561, "throughput/batch_count_per_second_total_cum": 0.09960982024760184, "throughput/batch_count_per_second_update_recent": 0.1793792702449362, "throughput/batch_count_per_second_update_cum": 0.17918093842711139, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1174405120, "throughput/token_count": 1174405120, "throughput/batch_count": 560, "throughput/flop_count": 0, "throughput/total_time": 5577.396705480001, "throughput/update_time": 3125.264993761957, "throughput/token_count_per_second_total_recent": 226609.67466244978, "throughput/token_count_per_second_total_cum": 210565.10447716637, "throughput/token_count_per_second_update_recent": 376190.18007332855, "throughput/token_count_per_second_update_cum": 375777.7731949508, "throughput/batch_count_per_second_total_recent": 0.10805591328737725, "throughput/batch_count_per_second_total_cum": 0.10040526603563613, "throughput/batch_count_per_second_update_recent": 0.17938145641008785, "throughput/batch_count_per_second_update_cum": 0.17918480548617877, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1195376640, "throughput/token_count": 1195376640, "throughput/batch_count": 570, "throughput/flop_count": 0, "throughput/total_time": 5723.710059185978, "throughput/update_time": 3181.0052098479937, "throughput/token_count_per_second_total_recent": 206245.84843487464, "throughput/token_count_per_second_total_cum": 208846.4697965511, "throughput/token_count_per_second_update_recent": 376197.7049604172, "throughput/token_count_per_second_update_cum": 375785.8164768997, "throughput/batch_count_per_second_total_recent": 0.09834568425887806, "throughput/batch_count_per_second_total_cum": 0.0995857571585422, "throughput/batch_count_per_second_update_recent": 0.179385044555863, "throughput/batch_count_per_second_update_cum": 0.17918864082188593, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1216348160, "throughput/token_count": 1216348160, "throughput/batch_count": 580, "throughput/flop_count": 0, "throughput/total_time": 5779.562335913011, "throughput/update_time": 3236.749040101946, "throughput/token_count_per_second_total_recent": 226672.6020573746, "throughput/token_count_per_second_total_cum": 210456.79401048462, "throughput/token_count_per_second_update_recent": 376204.1019461585, "throughput/token_count_per_second_update_cum": 375793.16311829025, "throughput/batch_count_per_second_total_recent": 0.10808591940754633, "throughput/batch_count_per_second_total_cum": 0.10035361958049994, "throughput/batch_count_per_second_update_recent": 0.17938809487636495, "throughput/batch_count_per_second_update_cum": 0.17919214397348893, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1237319680, "throughput/token_count": 1237319680, "throughput/batch_count": 590, "throughput/flop_count": 0, "throughput/total_time": 5925.651880743972, "throughput/update_time": 3292.4961628898745, "throughput/token_count_per_second_total_recent": 206344.170473904, "throughput/token_count_per_second_total_cum": 208807.35232199522, "throughput/token_count_per_second_update_recent": 376206.72185789444, "throughput/token_count_per_second_update_cum": 375799.8851892315, "throughput/batch_count_per_second_total_recent": 0.09839256786055756, "throughput/batch_count_per_second_total_cum": 0.0995671044931389, "throughput/batch_count_per_second_update_recent": 0.17938934414763186, "throughput/batch_count_per_second_update_cum": 0.17919534930669379, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1258291200, "throughput/token_count": 1258291200, "throughput/batch_count": 600, "throughput/flop_count": 0, "throughput/total_time": 5981.512405745976, "throughput/update_time": 3348.2465934828506, "throughput/token_count_per_second_total_recent": 226941.1853132744, "throughput/token_count_per_second_total_cum": 210363.3854861284, "throughput/token_count_per_second_update_recent": 376206.31798808364, "throughput/token_count_per_second_update_cum": 375806.0121525051, "throughput/batch_count_per_second_total_recent": 0.10821398988403054, "throughput/batch_count_per_second_total_cum": 0.10030907892519397, "throughput/batch_count_per_second_update_recent": 0.179389151567499, "throughput/batch_count_per_second_update_cum": 0.1791982708704496, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1279262720, "throughput/token_count": 1279262720, "throughput/batch_count": 610, "throughput/flop_count": 0, "throughput/total_time": 6128.563732119976, "throughput/update_time": 3403.9983463209355, "throughput/token_count_per_second_total_recent": 206368.1534528818, "throughput/token_count_per_second_total_cum": 208737.7688993178, "throughput/token_count_per_second_update_recent": 376200.7219350608, "throughput/token_count_per_second_update_cum": 375811.79244186054, "throughput/batch_count_per_second_total_recent": 0.09840400383609857, "throughput/batch_count_per_second_total_cum": 0.09953392453161135, "throughput/batch_count_per_second_update_recent": 0.17938648316147843, "throughput/batch_count_per_second_update_cum": 0.17920102712719943, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1300234240, "throughput/token_count": 1300234240, "throughput/batch_count": 620, "throughput/flop_count": 0, "throughput/total_time": 6184.421642497007, "throughput/update_time": 3459.7488345169113, "throughput/token_count_per_second_total_recent": 226831.784301176, "throughput/token_count_per_second_total_cum": 210243.4657859163, "throughput/token_count_per_second_update_recent": 376199.1359861768, "throughput/token_count_per_second_update_cum": 375817.523812116, "throughput/batch_count_per_second_total_recent": 0.10816182341631699, "throughput/batch_count_per_second_total_cum": 0.10025189675613227, "throughput/batch_count_per_second_update_recent": 0.17938572692211951, "throughput/batch_count_per_second_update_cum": 0.17920376005750466, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1321205760, "throughput/token_count": 1321205760, "throughput/batch_count": 630, "throughput/flop_count": 0, "throughput/total_time": 6330.95437408, "throughput/update_time": 3515.502175346832, "throughput/token_count_per_second_total_recent": 206383.2970724615, "throughput/token_count_per_second_total_cum": 208689.82493527993, "throughput/token_count_per_second_update_recent": 376200.3024754239, "throughput/token_count_per_second_update_cum": 375822.7684412264, "throughput/batch_count_per_second_total_recent": 0.09841122487662386, "throughput/batch_count_per_second_total_cum": 0.09951106306804654, "throughput/batch_count_per_second_update_recent": 0.17938628314753718, "throughput/batch_count_per_second_update_cum": 0.17920626089154548, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1342177280, "throughput/token_count": 1342177280, "throughput/batch_count": 640, "throughput/flop_count": 0, "throughput/total_time": 6386.8215373010025, "throughput/update_time": 3571.2574781817966, "throughput/token_count_per_second_total_recent": 226773.89765190557, "throughput/token_count_per_second_total_cum": 210147.9229005025, "throughput/token_count_per_second_update_recent": 376190.2867261693, "throughput/token_count_per_second_update_cum": 375827.64284005953, "throughput/batch_count_per_second_total_recent": 0.10813422091098097, "throughput/batch_count_per_second_total_cum": 0.10020633835816503, "throughput/batch_count_per_second_update_recent": 0.17938150726612534, "throughput/batch_count_per_second_update_cum": 0.17920858518603303, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1363148800, "throughput/token_count": 1363148800, "throughput/batch_count": 650, "throughput/flop_count": 0, "throughput/total_time": 6532.972345999966, "throughput/update_time": 3627.0106727198, "throughput/token_count_per_second_total_recent": 206414.41773401998, "throughput/token_count_per_second_total_cum": 208656.753435461, "throughput/token_count_per_second_update_recent": 376178.50880593655, "throughput/token_count_per_second_update_cum": 375832.58584067266, "throughput/batch_count_per_second_total_recent": 0.09842606436444282, "throughput/batch_count_per_second_total_cum": 0.09949529334805536, "throughput/batch_count_per_second_update_recent": 0.17937589111611202, "throughput/batch_count_per_second_update_cum": 0.1792109421923984, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1384120320, "throughput/token_count": 1384120320, "throughput/batch_count": 660, "throughput/flop_count": 0, "throughput/total_time": 6588.8636812510085, "throughput/update_time": 3682.7761907348176, "throughput/token_count_per_second_total_recent": 226804.35532531203, "throughput/token_count_per_second_total_cum": 210069.65494499364, "throughput/token_count_per_second_update_recent": 376163.1729038516, "throughput/token_count_per_second_update_cum": 375836.12153304083, "throughput/batch_count_per_second_total_recent": 0.10814874426141359, "throughput/batch_count_per_second_total_cum": 0.10016901728868181, "throughput/batch_count_per_second_update_recent": 0.17936857838814335, "throughput/batch_count_per_second_update_cum": 0.1792126281418995, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1405091840, "throughput/token_count": 1405091840, "throughput/batch_count": 670, "throughput/flop_count": 0, "throughput/total_time": 6734.952371816966, "throughput/update_time": 3738.5206979417126, "throughput/token_count_per_second_total_recent": 206451.62441947602, "throughput/token_count_per_second_total_cum": 208626.84135373213, "throughput/token_count_per_second_update_recent": 376160.05560813, "throughput/token_count_per_second_update_cum": 375841.66399656155, "throughput/batch_count_per_second_total_recent": 0.09844380589460183, "throughput/batch_count_per_second_total_cum": 0.09948103015600783, "throughput/batch_count_per_second_update_recent": 0.17936709194571018, "throughput/batch_count_per_second_update_cum": 0.17921527099445417, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1426063360, "throughput/token_count": 1426063360, "throughput/batch_count": 680, "throughput/flop_count": 0, "throughput/total_time": 6790.794109267998, "throughput/update_time": 3794.261904676736, "throughput/token_count_per_second_total_recent": 226803.15983162646, "throughput/token_count_per_second_total_cum": 209999.49888831488, "throughput/token_count_per_second_update_recent": 376162.4233401139, "throughput/token_count_per_second_update_cum": 375847.3705366151, "throughput/batch_count_per_second_total_recent": 0.10814817420560191, "throughput/batch_count_per_second_total_cum": 0.10013556427398437, "throughput/batch_count_per_second_update_recent": 0.17936822096830077, "throughput/batch_count_per_second_update_cum": 0.17921799208479647, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1447034880, "throughput/token_count": 1447034880, "throughput/batch_count": 690, "throughput/flop_count": 0, "throughput/total_time": 6936.7826500849915, "throughput/update_time": 3850.002778201713, "throughput/token_count_per_second_total_recent": 206475.3666236081, "throughput/token_count_per_second_total_cum": 208603.17426584937, "throughput/token_count_per_second_update_recent": 376167.19094877783, "throughput/token_count_per_second_update_cum": 375852.94436485873, "throughput/batch_count_per_second_total_recent": 0.09845512705974965, "throughput/batch_count_per_second_total_cum": 0.09946974480907887, "throughput/batch_count_per_second_update_recent": 0.1793704943412675, "throughput/batch_count_per_second_update_cum": 0.17922064989321648, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1468006400, "throughput/token_count": 1468006400, "throughput/batch_count": 700, "throughput/flop_count": 0, "throughput/total_time": 6992.639220207988, "throughput/update_time": 3905.7538149688044, "throughput/token_count_per_second_total_recent": 227066.45944653137, "throughput/token_count_per_second_total_cum": 209935.95604898603, "throughput/token_count_per_second_update_recent": 376166.4326185958, "throughput/token_count_per_second_update_cum": 375857.38107042597, "throughput/batch_count_per_second_total_recent": 0.10827372524572915, "throughput/batch_count_per_second_total_cum": 0.10010526468705465, "throughput/batch_count_per_second_update_recent": 0.17937013274125851, "throughput/batch_count_per_second_update_cum": 0.17922276547929095, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1488977920, "throughput/token_count": 1488977920, "throughput/batch_count": 710, "throughput/flop_count": 0, "throughput/total_time": 7139.175099594984, "throughput/update_time": 3961.498215056723, "throughput/token_count_per_second_total_recent": 206581.7699055105, "throughput/token_count_per_second_total_cum": 208564.4208508728, "throughput/token_count_per_second_update_recent": 376171.06133525714, "throughput/token_count_per_second_update_cum": 375862.32257804513, "throughput/batch_count_per_second_total_recent": 0.09850586409831548, "throughput/batch_count_per_second_total_cum": 0.09945126574081077, "throughput/batch_count_per_second_update_recent": 0.17937233988535745, "throughput/batch_count_per_second_update_cum": 0.1792251217737413, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1509949440, "throughput/token_count": 1509949440, "throughput/batch_count": 720, "throughput/flop_count": 0, "throughput/total_time": 7195.047693797969, "throughput/update_time": 4017.255656591733, "throughput/token_count_per_second_total_recent": 227063.145696439, "throughput/token_count_per_second_total_cum": 209859.54565687667, "throughput/token_count_per_second_update_recent": 376166.92664516583, "throughput/token_count_per_second_update_cum": 375865.9067471577, "throughput/batch_count_per_second_total_recent": 0.1082721451265521, "throughput/batch_count_per_second_total_cum": 0.10006882937282403, "throughput/batch_count_per_second_update_recent": 0.17937036831148426, "throughput/batch_count_per_second_update_cum": 0.17922683083875546, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1530920960, "throughput/token_count": 1530920960, "throughput/batch_count": 730, "throughput/flop_count": 0, "throughput/total_time": 7341.3063276839675, "throughput/update_time": 4072.9893345796154, "throughput/token_count_per_second_total_recent": 206635.46888340754, "throughput/token_count_per_second_total_cum": 208535.22406862627, "throughput/token_count_per_second_update_recent": 376181.2955132, "throughput/token_count_per_second_update_cum": 375871.5857668728, "throughput/batch_count_per_second_total_recent": 0.09853146976633431, "throughput/batch_count_per_second_total_cum": 0.09943734363013566, "throughput/batch_count_per_second_update_recent": 0.17937721992168426, "throughput/batch_count_per_second_update_cum": 0.1792295388063778, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1551892480, "throughput/token_count": 1551892480, "throughput/batch_count": 740, "throughput/flop_count": 0, "throughput/total_time": 7397.135954166006, "throughput/update_time": 4128.71436746855, "throughput/token_count_per_second_total_recent": 227045.5799586799, "throughput/token_count_per_second_total_cum": 209796.39817570028, "throughput/token_count_per_second_update_recent": 376201.0570804632, "throughput/token_count_per_second_update_cum": 375877.89851190796, "throughput/batch_count_per_second_total_recent": 0.10826376913007732, "throughput/batch_count_per_second_total_cum": 0.1000387183073522, "throughput/batch_count_per_second_update_recent": 0.1793866429712597, "throughput/batch_count_per_second_update_cum": 0.17923254895778082, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1572864000, "throughput/token_count": 1572864000, "throughput/batch_count": 750, "throughput/flop_count": 0, "throughput/total_time": 7543.42272252898, "throughput/update_time": 4184.450604122656, "throughput/token_count_per_second_total_recent": 206615.4910427585, "throughput/token_count_per_second_total_cum": 208508.00198463324, "throughput/token_count_per_second_update_recent": 376212.96799841465, "throughput/token_count_per_second_update_cum": 375883.03670029313, "throughput/batch_count_per_second_total_recent": 0.0985219435895722, "throughput/batch_count_per_second_total_cum": 0.09942436312896406, "throughput/batch_count_per_second_update_recent": 0.17939232253952725, "throughput/batch_count_per_second_update_cum": 0.17923499903692872, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1593835520, "throughput/token_count": 1593835520, "throughput/batch_count": 760, "throughput/flop_count": 0, "throughput/total_time": 7599.255373338994, "throughput/update_time": 4240.1828456086805, "throughput/token_count_per_second_total_recent": 227012.8788114758, "throughput/token_count_per_second_total_cum": 209735.7493198302, "throughput/token_count_per_second_update_recent": 376234.6153636415, "throughput/token_count_per_second_update_cum": 375888.393975898, "throughput/batch_count_per_second_total_recent": 0.10824817600797453, "throughput/batch_count_per_second_total_cum": 0.10000979867927084, "throughput/batch_count_per_second_update_recent": 0.17940264480764462, "throughput/batch_count_per_second_update_cum": 0.17923755358500384, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1614807040, "throughput/token_count": 1614807040, "throughput/batch_count": 770, "throughput/flop_count": 0, "throughput/total_time": 7745.557092848001, "throughput/update_time": 4295.924490743666, "throughput/token_count_per_second_total_recent": 206581.74261372964, "throughput/token_count_per_second_total_cum": 208481.71676264075, "throughput/token_count_per_second_update_recent": 376235.952636694, "throughput/token_count_per_second_update_cum": 375892.7894285361, "throughput/batch_count_per_second_total_recent": 0.09850585108458025, "throughput/batch_count_per_second_total_cum": 0.09941182935840642, "throughput/batch_count_per_second_update_recent": 0.1794032824691267, "throughput/batch_count_per_second_update_cum": 0.1792396495001488, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1635778560, "throughput/token_count": 1635778560, "throughput/batch_count": 780, "throughput/flop_count": 0, "throughput/total_time": 7801.41744280397, "throughput/update_time": 4351.678691691719, "throughput/token_count_per_second_total_recent": 226933.37492564123, "throughput/token_count_per_second_total_cum": 209677.09675744153, "throughput/token_count_per_second_update_recent": 376227.75468975585, "throughput/token_count_per_second_update_cum": 375895.9877076976, "throughput/batch_count_per_second_total_recent": 0.10821026560098707, "throughput/batch_count_per_second_total_cum": 0.09998183095810009, "throughput/batch_count_per_second_update_recent": 0.1793993733834056, "throughput/batch_count_per_second_update_cum": 0.17924117455849534, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1656750080, "throughput/token_count": 1656750080, "throughput/batch_count": 790, "throughput/flop_count": 0, "throughput/total_time": 7947.527445982967, "throughput/update_time": 4407.426374787698, "throughput/token_count_per_second_total_recent": 206553.68125797206, "throughput/token_count_per_second_total_cum": 208461.07059842808, "throughput/token_count_per_second_update_recent": 376223.76423611894, "throughput/token_count_per_second_update_cum": 375899.66096252814, "throughput/batch_count_per_second_total_recent": 0.09849247038744548, "throughput/batch_count_per_second_total_cum": 0.09940198450013546, "throughput/batch_count_per_second_update_recent": 0.17939747058683345, "throughput/batch_count_per_second_update_cum": 0.17924292610289008, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1677721600, "throughput/token_count": 1677721600, "throughput/batch_count": 800, "throughput/flop_count": 0, "throughput/total_time": 8003.3749305050005, "throughput/update_time": 4463.172267011658, "throughput/token_count_per_second_total_recent": 227038.5090666218, "throughput/token_count_per_second_total_cum": 209626.76552929383, "throughput/token_count_per_second_update_recent": 376226.4822738887, "throughput/token_count_per_second_update_cum": 375903.39328831865, "throughput/batch_count_per_second_total_recent": 0.10826039746600237, "throughput/batch_count_per_second_total_cum": 0.09995783115830127, "throughput/batch_count_per_second_update_recent": 0.17939876664823948, "throughput/batch_count_per_second_update_cum": 0.17924470581451352, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1698693120, "throughput/token_count": 1698693120, "throughput/batch_count": 810, "throughput/flop_count": 0, "throughput/total_time": 8150.418857850018, "throughput/update_time": 4518.926882733707, "throughput/token_count_per_second_total_recent": 206452.75952097905, "throughput/token_count_per_second_total_cum": 208417.89233493392, "throughput/token_count_per_second_update_recent": 376219.8815225436, "throughput/token_count_per_second_update_cum": 375906.3078649288, "throughput/batch_count_per_second_total_recent": 0.0984443471531768, "throughput/batch_count_per_second_total_cum": 0.09938139549967476, "throughput/batch_count_per_second_update_recent": 0.17939561916472607, "throughput/batch_count_per_second_update_cum": 0.17924609559294166, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1719664640, "throughput/token_count": 1719664640, "throughput/batch_count": 820, "throughput/flop_count": 0, "throughput/total_time": 8206.275687849964, "throughput/update_time": 4574.681749307667, "throughput/token_count_per_second_total_recent": 226850.0736600645, "throughput/token_count_per_second_total_cum": 209554.8218720093, "throughput/token_count_per_second_update_recent": 376221.7037055594, "throughput/token_count_per_second_update_cum": 375909.13078494574, "throughput/batch_count_per_second_total_recent": 0.10817054446223473, "throughput/batch_count_per_second_total_cum": 0.09992352574921098, "throughput/batch_count_per_second_update_recent": 0.17939648804929706, "throughput/batch_count_per_second_update_cum": 0.17924744166610038, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1740636160, "throughput/token_count": 1740636160, "throughput/batch_count": 830, "throughput/flop_count": 0, "throughput/total_time": 8352.533770180016, "throughput/update_time": 4630.437151424645, "throughput/token_count_per_second_total_recent": 206453.81222024956, "throughput/token_count_per_second_total_cum": 208396.18346882606, "throughput/token_count_per_second_update_recent": 376205.242551606, "throughput/token_count_per_second_update_cum": 375911.84224678634, "throughput/batch_count_per_second_total_recent": 0.0984448491193054, "throughput/batch_count_per_second_total_cum": 0.09937104390565207, "throughput/batch_count_per_second_update_recent": 0.17938863875942515, "throughput/batch_count_per_second_update_cum": 0.17924873459185903, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1761607680, "throughput/token_count": 1761607680, "throughput/batch_count": 840, "throughput/flop_count": 0, "throughput/total_time": 8408.397738418018, "throughput/update_time": 4686.193155970657, "throughput/token_count_per_second_total_recent": 226850.50251147253, "throughput/token_count_per_second_total_cum": 209505.75065582403, "throughput/token_count_per_second_update_recent": 376186.026846002, "throughput/token_count_per_second_update_cum": 375914.4408624181, "throughput/batch_count_per_second_total_recent": 0.10817074895452143, "throughput/batch_count_per_second_total_cum": 0.09990012676993562, "throughput/batch_count_per_second_update_recent": 0.17937947599697207, "throughput/batch_count_per_second_update_cum": 0.17924997370835213, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1782579200, "throughput/token_count": 1782579200, "throughput/batch_count": 850, "throughput/flop_count": 0, "throughput/total_time": 8554.563296968001, "throughput/update_time": 4741.9453661507, "throughput/token_count_per_second_total_recent": 206473.18883250366, "throughput/token_count_per_second_total_cum": 208377.58025962595, "throughput/token_count_per_second_update_recent": 376174.0919269365, "throughput/token_count_per_second_update_cum": 375917.27916659205, "throughput/batch_count_per_second_total_recent": 0.09845408860802825, "throughput/batch_count_per_second_total_cum": 0.09936217320424363, "throughput/batch_count_per_second_update_recent": 0.17937378498408152, "throughput/batch_count_per_second_update_cum": 0.17925132711724856, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1803550720, "throughput/token_count": 1803550720, "throughput/batch_count": 860, "throughput/flop_count": 0, "throughput/total_time": 8610.412180389976, "throughput/update_time": 4797.692331016588, "throughput/token_count_per_second_total_recent": 226876.2368375808, "throughput/token_count_per_second_total_cum": 209461.60093329177, "throughput/token_count_per_second_update_recent": 376164.13674917986, "throughput/token_count_per_second_update_cum": 375920.4624982369, "throughput/batch_count_per_second_total_recent": 0.10818302003745117, "throughput/batch_count_per_second_total_cum": 0.09987907454170789, "throughput/batch_count_per_second_update_recent": 0.17936903798541062, "throughput/batch_count_per_second_update_cum": 0.1792528450480637, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1824522240, "throughput/token_count": 1824522240, "throughput/batch_count": 870, "throughput/flop_count": 0, "throughput/total_time": 8756.645623472985, "throughput/update_time": 4853.441611350689, "throughput/token_count_per_second_total_recent": 206485.62357955208, "throughput/token_count_per_second_total_cum": 208358.5791240886, "throughput/token_count_per_second_update_recent": 376166.2899383543, "throughput/token_count_per_second_update_cum": 375923.3933572026, "throughput/batch_count_per_second_total_recent": 0.09846001795747379, "throughput/batch_count_per_second_total_cum": 0.09935311275677138, "throughput/batch_count_per_second_update_recent": 0.17937006470601763, "throughput/batch_count_per_second_update_cum": 0.179254242590524, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1845493760, "throughput/token_count": 1845493760, "throughput/batch_count": 880, "throughput/flop_count": 0, "throughput/total_time": 8812.488870778994, "throughput/update_time": 4909.191867132671, "throughput/token_count_per_second_total_recent": 226851.02800930993, "throughput/token_count_per_second_total_cum": 209417.99610316724, "throughput/token_count_per_second_update_recent": 376161.97948176035, "throughput/token_count_per_second_update_cum": 375926.1829539989, "throughput/batch_count_per_second_total_recent": 0.10817099953141686, "throughput/batch_count_per_second_total_cum": 0.09985828213842737, "throughput/batch_count_per_second_update_recent": 0.17936800932014482, "throughput/batch_count_per_second_update_cum": 0.1792555727739329, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1866465280, "throughput/token_count": 1866465280, "throughput/batch_count": 890, "throughput/flop_count": 0, "throughput/total_time": 8958.696731061966, "throughput/update_time": 4964.9637988246395, "throughput/token_count_per_second_total_recent": 206466.23819932318, "throughput/token_count_per_second_total_cum": 208341.16122365365, "throughput/token_count_per_second_update_recent": 376145.6536197637, "throughput/token_count_per_second_update_cum": 375927.2686825733, "throughput/batch_count_per_second_total_recent": 0.09845077428785476, "throughput/batch_count_per_second_total_cum": 0.0993448072546261, "throughput/batch_count_per_second_update_recent": 0.1793602245425051, "throughput/batch_count_per_second_update_cum": 0.17925609048966087, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1887436800, "throughput/token_count": 1887436800, "throughput/batch_count": 900, "throughput/flop_count": 0, "throughput/total_time": 9014.539005896018, "throughput/update_time": 5020.712582220614, "throughput/token_count_per_second_total_recent": 227057.5610501043, "throughput/token_count_per_second_total_cum": 209376.9630111434, "throughput/token_count_per_second_update_recent": 376144.61370902834, "throughput/token_count_per_second_update_cum": 375930.0635299869, "throughput/batch_count_per_second_total_recent": 0.1082694821596643, "throughput/batch_count_per_second_total_cum": 0.09983871603543444, "throughput/batch_count_per_second_update_recent": 0.1793597286744253, "throughput/batch_count_per_second_update_cum": 0.1792574231767592, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1908408320, "throughput/token_count": 1908408320, "throughput/batch_count": 910, "throughput/flop_count": 0, "throughput/total_time": 9161.434383540007, "throughput/update_time": 5076.4567962596775, "throughput/token_count_per_second_total_recent": 206498.71091097692, "throughput/token_count_per_second_total_cum": 208308.90012471878, "throughput/token_count_per_second_update_recent": 376151.2992328279, "throughput/token_count_per_second_update_cum": 375933.1353723154, "throughput/batch_count_per_second_total_recent": 0.09846625848339888, "throughput/batch_count_per_second_total_cum": 0.0993294239638895, "throughput/batch_count_per_second_update_recent": 0.17936291658059497, "throughput/batch_count_per_second_update_cum": 0.17925888794532557, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1929379840, "throughput/token_count": 1929379840, "throughput/batch_count": 920, "throughput/flop_count": 0, "throughput/total_time": 9217.298647498013, "throughput/update_time": 5132.212492840539, "throughput/token_count_per_second_total_recent": 226898.7257813499, "throughput/token_count_per_second_total_cum": 209321.61512676167, "throughput/token_count_per_second_update_recent": 376150.5847803551, "throughput/token_count_per_second_update_cum": 375935.2993843287, "throughput/batch_count_per_second_total_recent": 0.10819374360148902, "throughput/batch_count_per_second_total_cum": 0.0998123241075333, "throughput/batch_count_per_second_update_recent": 0.17936257590310817, "throughput/batch_count_per_second_update_cum": 0.1792599198266643, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1950351360, "throughput/token_count": 1950351360, "throughput/batch_count": 930, "throughput/flop_count": 0, "throughput/total_time": 9363.553384587984, "throughput/update_time": 5187.936918072519, "throughput/token_count_per_second_total_recent": 206497.91363317418, "throughput/token_count_per_second_total_cum": 208291.79691656338, "throughput/token_count_per_second_update_recent": 376172.6466329814, "throughput/token_count_per_second_update_cum": 375939.6829220924, "throughput/batch_count_per_second_total_recent": 0.09846587831171712, "throughput/batch_count_per_second_total_cum": 0.09932126851871652, "throughput/batch_count_per_second_update_recent": 0.1793730958142192, "throughput/batch_count_per_second_update_cum": 0.17926201006035442, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1971322880, "throughput/token_count": 1971322880, "throughput/batch_count": 940, "throughput/flop_count": 0, "throughput/total_time": 9419.395047847007, "throughput/update_time": 5243.674068749533, "throughput/token_count_per_second_total_recent": 226882.622994051, "throughput/token_count_per_second_total_cum": 209283.38497179665, "throughput/token_count_per_second_update_recent": 376183.9020408124, "throughput/token_count_per_second_update_cum": 375943.0609443093, "throughput/batch_count_per_second_total_recent": 0.10818606519415426, "throughput/batch_count_per_second_total_cum": 0.09979409454908211, "throughput/batch_count_per_second_update_recent": 0.17937846281090375, "throughput/batch_count_per_second_update_cum": 0.17926362082686867, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 1992294400, "throughput/token_count": 1992294400, "throughput/batch_count": 950, "throughput/flop_count": 0, "throughput/total_time": 9565.561509469, "throughput/update_time": 5299.418035702605, "throughput/token_count_per_second_total_recent": 206503.13186719557, "throughput/token_count_per_second_total_cum": 208277.8306352238, "throughput/token_count_per_second_update_recent": 376191.0271170028, "throughput/token_count_per_second_update_cum": 375945.88435517874, "throughput/batch_count_per_second_total_recent": 0.09846836655959872, "throughput/batch_count_per_second_total_cum": 0.09931460887681189, "throughput/batch_count_per_second_update_recent": 0.17938186031198636, "throughput/batch_count_per_second_update_cum": 0.1792649671340841, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 2013265920, "throughput/token_count": 2013265920, "throughput/batch_count": 960, "throughput/flop_count": 0, "throughput/total_time": 9621.420487532974, "throughput/update_time": 5355.161261588568, "throughput/token_count_per_second_total_recent": 226896.5008206212, "throughput/token_count_per_second_total_cum": 209248.30409487913, "throughput/token_count_per_second_update_recent": 376193.17917170195, "throughput/token_count_per_second_update_cum": 375948.70101125207, "throughput/batch_count_per_second_total_recent": 0.10819268265753804, "throughput/batch_count_per_second_total_cum": 0.09977736668342549, "throughput/batch_count_per_second_update_recent": 0.1793828864916334, "throughput/batch_count_per_second_update_cum": 0.17926631022036174, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 2034237440, "throughput/token_count": 2034237440, "throughput/batch_count": 970, "throughput/flop_count": 0, "throughput/total_time": 9767.543393198983, "throughput/update_time": 5410.899587089545, "throughput/token_count_per_second_total_recent": 206522.4955250733, "throughput/token_count_per_second_total_cum": 208265.00155775237, "throughput/token_count_per_second_update_recent": 376200.0688895108, "throughput/token_count_per_second_update_cum": 375951.8001135539, "throughput/batch_count_per_second_total_recent": 0.09847759987119355, "throughput/batch_count_per_second_total_cum": 0.09930849149596804, "throughput/batch_count_per_second_update_recent": 0.1793861717650942, "throughput/batch_count_per_second_update_cum": 0.17926778798749632, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} -{"step": 2055208960, "throughput/token_count": 2055208960, "throughput/batch_count": 980, "throughput/flop_count": 0, "throughput/total_time": 9823.39123856998, "throughput/update_time": 5466.646158660587, "throughput/token_count_per_second_total_recent": 226909.77535019378, "throughput/token_count_per_second_total_cum": 209215.83087626088, "throughput/token_count_per_second_update_recent": 376202.1042071718, "throughput/token_count_per_second_update_cum": 375954.26891568885, "throughput/batch_count_per_second_total_recent": 0.1081990124464959, "throughput/batch_count_per_second_total_cum": 0.09976188224614185, "throughput/batch_count_per_second_update_recent": 0.1793871422801837, "throughput/batch_count_per_second_update_cum": 0.17926896520409052, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 20971520, "throughput/token_count": 20971520, "throughput/batch_count": 10, "throughput/flop_count": 0, "throughput/total_time": 69.9843489350751, "throughput/update_time": 69.80232589994557, "throughput/token_count_per_second_total_recent": 309942.6919457509, "throughput/token_count_per_second_total_cum": 299660.14286216197, "throughput/token_count_per_second_update_recent": 310597.44024901144, "throughput/token_count_per_second_update_cum": 300441.56451262825, "throughput/batch_count_per_second_total_recent": 0.14779219243323846, "throughput/batch_count_per_second_total_cum": 0.14288909094913577, "throughput/batch_count_per_second_update_recent": 0.14810440075350353, "throughput/batch_count_per_second_update_cum": 0.1432617018283025, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 41943040, "throughput/token_count": 41943040, "throughput/batch_count": 20, "throughput/flop_count": 0, "throughput/total_time": 137.6325364280492, "throughput/update_time": 137.34443728171755, "throughput/token_count_per_second_total_recent": 309977.3777010182, "throughput/token_count_per_second_total_cum": 304746.5453194402, "throughput/token_count_per_second_update_recent": 310543.76632496435, "throughput/token_count_per_second_update_cum": 305385.7937760338, "throughput/batch_count_per_second_total_recent": 0.14780873189021024, "throughput/batch_count_per_second_total_cum": 0.1453144766423417, "throughput/batch_count_per_second_update_recent": 0.1480788070320913, "throughput/batch_count_per_second_update_cum": 0.14561929405976953, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 62914560, "throughput/token_count": 62914560, "throughput/batch_count": 30, "throughput/flop_count": 0, "throughput/total_time": 306.0479498610366, "throughput/update_time": 204.88854807184543, "throughput/token_count_per_second_total_recent": 204800.05753859566, "throughput/token_count_per_second_total_cum": 205570.92451874563, "throughput/token_count_per_second_update_recent": 310523.9426066149, "throughput/token_count_per_second_update_cum": 307067.23529486195, "throughput/batch_count_per_second_total_recent": 0.09765627743654044, "throughput/batch_count_per_second_total_cum": 0.09802385545670778, "throughput/batch_count_per_second_update_recent": 0.14806935434656854, "throughput/batch_count_per_second_update_cum": 0.1464210678552923, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 83886080, "throughput/token_count": 83886080, "throughput/batch_count": 40, "throughput/flop_count": 0, "throughput/total_time": 373.6941461900715, "throughput/update_time": 272.43619964295067, "throughput/token_count_per_second_total_recent": 224321.32952981937, "throughput/token_count_per_second_total_cum": 224477.9075488465, "throughput/token_count_per_second_update_recent": 310510.11172381276, "throughput/token_count_per_second_update_cum": 307910.9167942417, "throughput/batch_count_per_second_total_recent": 0.10696474529734581, "throughput/batch_count_per_second_total_cum": 0.10703940751497579, "throughput/batch_count_per_second_update_recent": 0.14806275926771773, "throughput/batch_count_per_second_update_cum": 0.1468233665438851, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 104857600, "throughput/token_count": 104857600, "throughput/batch_count": 50, "throughput/flop_count": 0, "throughput/total_time": 541.7261677470524, "throughput/update_time": 339.9664637759561, "throughput/token_count_per_second_total_recent": 192927.32035630688, "throughput/token_count_per_second_total_cum": 193561.9991112577, "throughput/token_count_per_second_update_recent": 310518.2407009215, "throughput/token_count_per_second_update_cum": 308435.1286752302, "throughput/batch_count_per_second_total_recent": 0.09199491517844528, "throughput/batch_count_per_second_total_cum": 0.09229755359232793, "throughput/batch_count_per_second_update_recent": 0.14806663546606136, "throughput/batch_count_per_second_update_cum": 0.14707333024751196, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 125829120, "throughput/token_count": 125829120, "throughput/batch_count": 60, "throughput/flop_count": 0, "throughput/total_time": 609.3696456589969, "throughput/update_time": 407.4915405898355, "throughput/token_count_per_second_total_recent": 206123.20192867855, "throughput/token_count_per_second_total_cum": 206490.6266604785, "throughput/token_count_per_second_update_recent": 310527.65688149043, "throughput/token_count_per_second_update_cum": 308789.52681536647, "throughput/batch_count_per_second_total_recent": 0.09828720184739997, "throughput/batch_count_per_second_total_cum": 0.09846240361236501, "throughput/batch_count_per_second_update_recent": 0.14807112545084497, "throughput/batch_count_per_second_update_cum": 0.147242320449527, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 146800640, "throughput/token_count": 146800640, "throughput/batch_count": 70, "throughput/flop_count": 0, "throughput/total_time": 776.8270515420008, "throughput/update_time": 474.9853536799783, "throughput/token_count_per_second_total_recent": 188480.04928788898, "throughput/token_count_per_second_total_cum": 188974.67551960362, "throughput/token_count_per_second_update_recent": 310555.1798782961, "throughput/token_count_per_second_update_cum": 309063.5087222227, "throughput/batch_count_per_second_total_recent": 0.08987429108042191, "throughput/batch_count_per_second_total_cum": 0.09011014724712545, "throughput/batch_count_per_second_update_recent": 0.14808424943842702, "throughput/batch_count_per_second_update_cum": 0.1473729652033914, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 167772160, "throughput/token_count": 167772160, "throughput/batch_count": 80, "throughput/flop_count": 0, "throughput/total_time": 844.4406513640424, "throughput/update_time": 542.4893449847586, "throughput/token_count_per_second_total_recent": 198329.4292922265, "throughput/token_count_per_second_total_cum": 198678.45031974025, "throughput/token_count_per_second_update_recent": 310569.8124871841, "throughput/token_count_per_second_update_cum": 309263.51190310216, "throughput/batch_count_per_second_total_recent": 0.0945708414517529, "throughput/batch_count_per_second_total_cum": 0.0947372676466657, "throughput/batch_count_per_second_update_recent": 0.14809122681006628, "throughput/batch_count_per_second_update_cum": 0.14746833415179356, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 188743680, "throughput/token_count": 188743680, "throughput/batch_count": 90, "throughput/flop_count": 0, "throughput/total_time": 1012.7395372070605, "throughput/update_time": 610.0022338308627, "throughput/token_count_per_second_total_recent": 185967.46706593057, "throughput/token_count_per_second_total_cum": 186369.41984166877, "throughput/token_count_per_second_update_recent": 310576.5595545708, "throughput/token_count_per_second_update_cum": 309414.73577018664, "throughput/batch_count_per_second_total_recent": 0.08867619851395157, "throughput/batch_count_per_second_total_cum": 0.08886786453326644, "throughput/batch_count_per_second_update_recent": 0.14809444406250516, "throughput/batch_count_per_second_update_cum": 0.1475404433108266, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 209715200, "throughput/token_count": 209715200, "throughput/batch_count": 100, "throughput/flop_count": 0, "throughput/total_time": 1080.3595280270092, "throughput/update_time": 677.5131666237721, "throughput/token_count_per_second_total_recent": 193805.25760311782, "throughput/token_count_per_second_total_cum": 194116.12019842071, "throughput/token_count_per_second_update_recent": 310582.85259026854, "throughput/token_count_per_second_update_cum": 309536.7150502277, "throughput/batch_count_per_second_total_recent": 0.09241354828029529, "throughput/batch_count_per_second_total_cum": 0.09256177911683117, "throughput/batch_count_per_second_update_recent": 0.14809744481576373, "throughput/batch_count_per_second_update_cum": 0.14759860756408105, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 230686720, "throughput/token_count": 230686720, "throughput/batch_count": 110, "throughput/flop_count": 0, "throughput/total_time": 1248.9534793440253, "throughput/update_time": 745.0211975647835, "throughput/token_count_per_second_total_recent": 177119.43361774256, "throughput/token_count_per_second_total_cum": 184704.01325208778, "throughput/token_count_per_second_update_recent": 310589.4519441678, "throughput/token_count_per_second_update_cum": 309637.79386953695, "throughput/batch_count_per_second_total_recent": 0.0844571273888314, "throughput/batch_count_per_second_total_cum": 0.08807373678783788, "throughput/batch_count_per_second_update_recent": 0.14810059163292302, "throughput/batch_count_per_second_update_cum": 0.147646805701035, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 251658240, "throughput/token_count": 251658240, "throughput/batch_count": 120, "throughput/flop_count": 0, "throughput/total_time": 1316.5735572939739, "throughput/update_time": 812.5169301189017, "throughput/token_count_per_second_total_recent": 193779.58384615908, "throughput/token_count_per_second_total_cum": 191146.35760818943, "throughput/token_count_per_second_update_recent": 310609.00699722907, "throughput/token_count_per_second_update_cum": 309726.76466344274, "throughput/batch_count_per_second_total_recent": 0.09240130607898668, "throughput/batch_count_per_second_total_cum": 0.09114568596276733, "throughput/batch_count_per_second_update_recent": 0.14810991620885328, "throughput/batch_count_per_second_update_cum": 0.1476892302815641, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 272629760, "throughput/token_count": 272629760, "throughput/batch_count": 130, "throughput/flop_count": 0, "throughput/total_time": 1485.0034688690212, "throughput/update_time": 880.0041609148029, "throughput/token_count_per_second_total_recent": 177120.86096703142, "throughput/token_count_per_second_total_cum": 183588.63512125993, "throughput/token_count_per_second_update_recent": 310636.92323318, "throughput/token_count_per_second_update_cum": 309805.08059937967, "throughput/batch_count_per_second_total_recent": 0.08445780800201007, "throughput/batch_count_per_second_total_cum": 0.08754188304961201, "throughput/batch_count_per_second_update_recent": 0.14812322770747186, "throughput/batch_count_per_second_update_cum": 0.1477265742298983, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 293601280, "throughput/token_count": 293601280, "throughput/batch_count": 140, "throughput/flop_count": 0, "throughput/total_time": 1552.605858018971, "throughput/update_time": 947.4958640788682, "throughput/token_count_per_second_total_recent": 193716.35380611845, "throughput/token_count_per_second_total_cum": 189102.2621637001, "throughput/token_count_per_second_update_recent": 310660.307710199, "throughput/token_count_per_second_update_cum": 309870.7774154052, "throughput/batch_count_per_second_total_recent": 0.09237115564638064, "throughput/batch_count_per_second_total_cum": 0.09017098529992108, "throughput/batch_count_per_second_update_recent": 0.14813437829503964, "throughput/batch_count_per_second_update_cum": 0.1477579009129549, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 314572800, "throughput/token_count": 314572800, "throughput/batch_count": 150, "throughput/flop_count": 0, "throughput/total_time": 1720.6480030510575, "throughput/update_time": 1014.9766604538308, "throughput/token_count_per_second_total_recent": 177126.3777854906, "throughput/token_count_per_second_total_cum": 182822.2852333532, "throughput/token_count_per_second_update_recent": 310686.5492480094, "throughput/token_count_per_second_update_cum": 309931.0676359235, "throughput/batch_count_per_second_total_recent": 0.08446043862604646, "throughput/batch_count_per_second_total_cum": 0.08717645894687329, "throughput/batch_count_per_second_update_recent": 0.14814689123535604, "throughput/batch_count_per_second_update_cum": 0.1477866495303743, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 335544320, "throughput/token_count": 335544320, "throughput/batch_count": 160, "throughput/flop_count": 0, "throughput/total_time": 1788.244720379007, "throughput/update_time": 1082.4689192509977, "throughput/token_count_per_second_total_recent": 193628.89651439173, "throughput/token_count_per_second_total_cum": 187638.92669504625, "throughput/token_count_per_second_update_recent": 310700.48820661905, "throughput/token_count_per_second_update_cum": 309980.5583629839, "throughput/batch_count_per_second_total_recent": 0.09232945275992953, "throughput/batch_count_per_second_total_cum": 0.08947321257355034, "throughput/batch_count_per_second_update_recent": 0.14815353784876778, "throughput/batch_count_per_second_update_cum": 0.1478102485480232, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 356515840, "throughput/token_count": 356515840, "throughput/batch_count": 170, "throughput/flop_count": 0, "throughput/total_time": 1956.1968261280563, "throughput/update_time": 1149.9545339199249, "throughput/token_count_per_second_total_recent": 177058.05415288036, "throughput/token_count_per_second_total_cum": 182249.4726697107, "throughput/token_count_per_second_update_recent": 310703.2023496938, "throughput/token_count_per_second_update_cum": 310026.0310159579, "throughput/batch_count_per_second_total_recent": 0.08442785937923449, "throughput/batch_count_per_second_total_cum": 0.08690332063184295, "throughput/batch_count_per_second_update_recent": 0.1481548320530385, "throughput/batch_count_per_second_update_cum": 0.14783193159864325, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 377487360, "throughput/token_count": 377487360, "throughput/batch_count": 180, "throughput/flop_count": 0, "throughput/total_time": 2023.8116153230658, "throughput/update_time": 1217.4519352857023, "throughput/token_count_per_second_total_recent": 193687.22827011504, "throughput/token_count_per_second_total_cum": 186522.97335478076, "throughput/token_count_per_second_update_recent": 310707.7461973368, "throughput/token_count_per_second_update_cum": 310063.46046130697, "throughput/batch_count_per_second_total_recent": 0.09235726750856163, "throughput/batch_count_per_second_total_cum": 0.08894108455409086, "throughput/batch_count_per_second_update_recent": 0.1481569987284359, "throughput/batch_count_per_second_update_cum": 0.14784977934899662, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 398458880, "throughput/token_count": 398458880, "throughput/batch_count": 190, "throughput/flop_count": 0, "throughput/total_time": 2191.784361746977, "throughput/update_time": 1284.9417869256577, "throughput/token_count_per_second_total_recent": 177107.74272583754, "throughput/token_count_per_second_total_cum": 181796.5703899838, "throughput/token_count_per_second_update_recent": 310720.2476640607, "throughput/token_count_per_second_update_cum": 310098.7796134717, "throughput/batch_count_per_second_total_recent": 0.08445155273715856, "throughput/batch_count_per_second_total_cum": 0.08668735999583425, "throughput/batch_count_per_second_update_recent": 0.14816295989230188, "throughput/batch_count_per_second_update_cum": 0.147866620833145, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 419430400, "throughput/token_count": 419430400, "throughput/batch_count": 200, "throughput/flop_count": 0, "throughput/total_time": 2259.3751640570117, "throughput/update_time": 1352.4327959185466, "throughput/token_count_per_second_total_recent": 193804.4577172068, "throughput/token_count_per_second_total_cum": 185639.99758537504, "throughput/token_count_per_second_update_recent": 310726.90868936176, "throughput/token_count_per_second_update_cum": 310130.30833456747, "throughput/batch_count_per_second_total_recent": 0.09241316686497059, "throughput/batch_count_per_second_total_cum": 0.08852004889744522, "throughput/batch_count_per_second_update_recent": 0.14816613611667717, "throughput/batch_count_per_second_update_cum": 0.14788165489891408, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 440401920, "throughput/token_count": 440401920, "throughput/batch_count": 210, "throughput/flop_count": 0, "throughput/total_time": 2427.812098467024, "throughput/update_time": 1419.9271084357752, "throughput/token_count_per_second_total_recent": 177134.59283689596, "throughput/token_count_per_second_total_cum": 181398.68413955093, "throughput/token_count_per_second_update_recent": 310732.60218958725, "throughput/token_count_per_second_update_cum": 310158.1182467578, "throughput/batch_count_per_second_total_recent": 0.08446435586781309, "throughput/batch_count_per_second_total_cum": 0.08649763304688975, "throughput/batch_count_per_second_update_recent": 0.14816885098914492, "throughput/batch_count_per_second_update_cum": 0.1478949156984128, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 461373440, "throughput/token_count": 461373440, "throughput/batch_count": 220, "throughput/flop_count": 0, "throughput/total_time": 2495.436698611011, "throughput/update_time": 1487.4421038717264, "throughput/token_count_per_second_total_recent": 193804.23981794587, "throughput/token_count_per_second_total_cum": 184886.8537746543, "throughput/token_count_per_second_update_recent": 310724.339813112, "throughput/token_count_per_second_update_cum": 310179.09120568214, "throughput/batch_count_per_second_total_recent": 0.09241306296250623, "throughput/batch_count_per_second_total_cum": 0.08816092194302287, "throughput/batch_count_per_second_update_recent": 0.14816491118102645, "throughput/batch_count_per_second_update_cum": 0.1479049163845454, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 482344960, "throughput/token_count": 482344960, "throughput/batch_count": 230, "throughput/flop_count": 0, "throughput/total_time": 2663.784864959074, "throughput/update_time": 1554.9646949897287, "throughput/token_count_per_second_total_recent": 177146.4426055361, "throughput/token_count_per_second_total_cum": 181075.04338846472, "throughput/token_count_per_second_update_recent": 310706.39257208345, "throughput/token_count_per_second_update_cum": 310196.7276518687, "throughput/batch_count_per_second_total_recent": 0.08447000627781683, "throughput/batch_count_per_second_total_cum": 0.08634330911086308, "throughput/batch_count_per_second_update_recent": 0.14815635326961682, "throughput/batch_count_per_second_update_cum": 0.1479133260974258, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 503316480, "throughput/token_count": 503316480, "throughput/batch_count": 240, "throughput/flop_count": 0, "throughput/total_time": 2731.4342436430743, "throughput/update_time": 1622.4905407206388, "throughput/token_count_per_second_total_recent": 193740.5528374761, "throughput/token_count_per_second_total_cum": 184268.20311394252, "throughput/token_count_per_second_update_recent": 310689.6047829585, "throughput/token_count_per_second_update_cum": 310212.2738887889, "throughput/batch_count_per_second_total_recent": 0.09238269464372449, "throughput/batch_count_per_second_total_cum": 0.08786592632004858, "throughput/batch_count_per_second_update_recent": 0.14814834822795797, "throughput/batch_count_per_second_update_cum": 0.14792073912085957, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 524288000, "throughput/token_count": 524288000, "throughput/batch_count": 250, "throughput/flop_count": 0, "throughput/total_time": 2900.0770314050606, "throughput/update_time": 1690.0012620057678, "throughput/token_count_per_second_total_recent": 177048.5996066313, "throughput/token_count_per_second_total_cum": 180784.16342823394, "throughput/token_count_per_second_update_recent": 310677.41498082376, "throughput/token_count_per_second_update_cum": 310229.35413536435, "throughput/batch_count_per_second_total_recent": 0.08442335110026898, "throughput/batch_count_per_second_total_cum": 0.08620460673724839, "throughput/batch_count_per_second_update_recent": 0.14814253567734897, "throughput/batch_count_per_second_update_cum": 0.14792888361709802, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 545259520, "throughput/token_count": 545259520, "throughput/batch_count": 260, "throughput/flop_count": 0, "throughput/total_time": 2967.7298853070242, "throughput/update_time": 1757.5320481728995, "throughput/token_count_per_second_total_recent": 193606.9159292375, "throughput/token_count_per_second_total_cum": 183729.4973169671, "throughput/token_count_per_second_update_recent": 310661.59836184484, "throughput/token_count_per_second_update_cum": 310241.5802698122, "throughput/batch_count_per_second_total_recent": 0.09231897160016894, "throughput/batch_count_per_second_total_cum": 0.08760905137871128, "throughput/batch_count_per_second_update_recent": 0.14813499372570268, "throughput/batch_count_per_second_update_cum": 0.14793471349230394, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 566231040, "throughput/token_count": 566231040, "throughput/batch_count": 270, "throughput/flop_count": 0, "throughput/total_time": 3136.192894882057, "throughput/update_time": 1825.0568903158419, "throughput/token_count_per_second_total_recent": 176963.1508015356, "throughput/token_count_per_second_total_cum": 180547.2619123749, "throughput/token_count_per_second_update_recent": 310641.3579933702, "throughput/token_count_per_second_update_cum": 310253.91208600014, "throughput/batch_count_per_second_total_recent": 0.0843826059348753, "throughput/batch_count_per_second_total_cum": 0.08609164329165216, "throughput/batch_count_per_second_update_recent": 0.14812534236591826, "throughput/batch_count_per_second_update_cum": 0.14794059376049049, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 587202560, "throughput/token_count": 587202560, "throughput/batch_count": 280, "throughput/flop_count": 0, "throughput/total_time": 3203.840811592061, "throughput/update_time": 1892.5856667858316, "throughput/token_count_per_second_total_recent": 193511.88840305162, "throughput/token_count_per_second_total_cum": 183280.8165360144, "throughput/token_count_per_second_update_recent": 310627.77496514656, "throughput/token_count_per_second_update_cum": 310264.7189531151, "throughput/batch_count_per_second_total_recent": 0.0922736589446314, "throughput/batch_count_per_second_total_cum": 0.08739510371018143, "throughput/batch_count_per_second_update_recent": 0.1481188654733403, "throughput/batch_count_per_second_update_cum": 0.14794574687629466, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 608174080, "throughput/token_count": 608174080, "throughput/batch_count": 290, "throughput/flop_count": 0, "throughput/total_time": 3371.6769031099975, "throughput/update_time": 1960.1043883458478, "throughput/token_count_per_second_total_recent": 176978.07145291092, "throughput/token_count_per_second_total_cum": 180377.3307694539, "throughput/token_count_per_second_update_recent": 310612.69735290913, "throughput/token_count_per_second_update_cum": 310276.37283810397, "throughput/batch_count_per_second_total_recent": 0.08438972065587565, "throughput/batch_count_per_second_total_cum": 0.0860106138083715, "throughput/batch_count_per_second_update_recent": 0.14811167590756852, "throughput/batch_count_per_second_update_cum": 0.14795130388169478, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 629145600, "throughput/token_count": 629145600, "throughput/batch_count": 300, "throughput/flop_count": 0, "throughput/total_time": 3439.312252484029, "throughput/update_time": 2027.637110557058, "throughput/token_count_per_second_total_recent": 193613.34704280557, "throughput/token_count_per_second_total_cum": 182927.7349114208, "throughput/token_count_per_second_update_recent": 310593.03430739546, "throughput/token_count_per_second_update_cum": 310285.1080818664, "throughput/batch_count_per_second_total_recent": 0.09232203819408683, "throughput/batch_count_per_second_total_cum": 0.08722674127169647, "throughput/batch_count_per_second_update_recent": 0.14810229983682416, "throughput/batch_count_per_second_update_cum": 0.14795546917050667, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 650117120, "throughput/token_count": 650117120, "throughput/batch_count": 310, "throughput/flop_count": 0, "throughput/total_time": 3607.989511896041, "throughput/update_time": 2095.1480140229687, "throughput/token_count_per_second_total_recent": 176935.21674326094, "throughput/token_count_per_second_total_cum": 180188.19562985806, "throughput/token_count_per_second_update_recent": 310585.47992597806, "throughput/token_count_per_second_update_cum": 310296.5115823425, "throughput/batch_count_per_second_total_recent": 0.08436928593791053, "throughput/batch_count_per_second_total_cum": 0.08592042714589027, "throughput/batch_count_per_second_update_recent": 0.1480986976270571, "throughput/batch_count_per_second_update_cum": 0.14796090678326726, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 671088640, "throughput/token_count": 671088640, "throughput/batch_count": 320, "throughput/flop_count": 0, "throughput/total_time": 3675.6837699849857, "throughput/update_time": 2162.6913318177685, "throughput/token_count_per_second_total_recent": 193535.3780044755, "throughput/token_count_per_second_total_cum": 182575.18382837958, "throughput/token_count_per_second_update_recent": 310574.3196292505, "throughput/token_count_per_second_update_cum": 310302.5522536967, "throughput/batch_count_per_second_total_recent": 0.09228485965942168, "throughput/batch_count_per_second_total_cum": 0.08705863181513766, "throughput/batch_count_per_second_update_recent": 0.14809337598288083, "throughput/batch_count_per_second_update_cum": 0.14796378719982944, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 692060160, "throughput/token_count": 692060160, "throughput/batch_count": 330, "throughput/flop_count": 0, "throughput/total_time": 3843.81031891203, "throughput/update_time": 2230.2011163331335, "throughput/token_count_per_second_total_recent": 176959.54994621358, "throughput/token_count_per_second_total_cum": 180045.3463051954, "throughput/token_count_per_second_update_recent": 310581.20623943297, "throughput/token_count_per_second_update_cum": 310312.8928290898, "throughput/batch_count_per_second_total_recent": 0.08438088891325644, "throughput/batch_count_per_second_total_cum": 0.08585231127986688, "throughput/batch_count_per_second_update_recent": 0.1480966597745099, "throughput/batch_count_per_second_update_cum": 0.14796871797041405, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 713031680, "throughput/token_count": 713031680, "throughput/batch_count": 340, "throughput/flop_count": 0, "throughput/total_time": 3911.4375108770328, "throughput/update_time": 2297.7100352901034, "throughput/token_count_per_second_total_recent": 193631.92521355467, "throughput/token_count_per_second_total_cum": 182294.02310970888, "throughput/token_count_per_second_update_recent": 310586.6112290743, "throughput/token_count_per_second_update_cum": 310322.7426649483, "throughput/batch_count_per_second_total_recent": 0.09233089695623144, "throughput/batch_count_per_second_total_cum": 0.08692456393704838, "throughput/batch_count_per_second_update_recent": 0.14809923707441058, "throughput/batch_count_per_second_update_cum": 0.1479734147381536, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 734003200, "throughput/token_count": 734003200, "throughput/batch_count": 350, "throughput/flop_count": 0, "throughput/total_time": 4079.354542599991, "throughput/update_time": 2365.248016706202, "throughput/token_count_per_second_total_recent": 177072.11835618448, "throughput/token_count_per_second_total_cum": 179931.21027724657, "throughput/token_count_per_second_update_recent": 310574.96614351164, "throughput/token_count_per_second_update_cum": 310328.2170899602, "throughput/batch_count_per_second_total_recent": 0.08443456571397041, "throughput/batch_count_per_second_total_cum": 0.08579788698065118, "throughput/batch_count_per_second_update_recent": 0.14809368426490385, "throughput/batch_count_per_second_update_cum": 0.14797602514741906, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 754974720, "throughput/token_count": 754974720, "throughput/batch_count": 360, "throughput/flop_count": 0, "throughput/total_time": 4147.001440979075, "throughput/update_time": 2432.7737647151807, "throughput/token_count_per_second_total_recent": 193731.21135467, "throughput/token_count_per_second_total_cum": 182053.16075842894, "throughput/token_count_per_second_update_recent": 310576.8378526738, "throughput/token_count_per_second_update_cum": 310334.94809509726, "throughput/batch_count_per_second_total_recent": 0.09237824027760982, "throughput/batch_count_per_second_total_cum": 0.08680971181794593, "throughput/batch_count_per_second_update_recent": 0.1480945767653817, "throughput/batch_count_per_second_update_cum": 0.14797923474078048, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 775946240, "throughput/token_count": 775946240, "throughput/batch_count": 370, "throughput/flop_count": 0, "throughput/total_time": 4315.090891462984, "throughput/update_time": 2500.2843339033425, "throughput/token_count_per_second_total_recent": 177128.88709032987, "throughput/token_count_per_second_total_cum": 179821.52856505048, "throughput/token_count_per_second_update_recent": 310584.244497998, "throughput/token_count_per_second_update_cum": 310343.199562677, "throughput/batch_count_per_second_total_recent": 0.08446163515583509, "throughput/batch_count_per_second_total_cum": 0.08574558666470074, "throughput/batch_count_per_second_update_recent": 0.14809810852908992, "throughput/batch_count_per_second_update_cum": 0.1479831693471322, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 796917760, "throughput/token_count": 796917760, "throughput/batch_count": 380, "throughput/flop_count": 0, "throughput/total_time": 4382.723289367976, "throughput/update_time": 2567.801604798413, "throughput/token_count_per_second_total_recent": 193690.68338169353, "throughput/token_count_per_second_total_cum": 181831.6392306214, "throughput/token_count_per_second_update_recent": 310588.68014506757, "throughput/token_count_per_second_update_cum": 310350.20716195967, "throughput/batch_count_per_second_total_recent": 0.09235891503414799, "throughput/batch_count_per_second_total_cum": 0.08670408212214537, "throughput/batch_count_per_second_update_recent": 0.14810022361043337, "throughput/batch_count_per_second_update_cum": 0.14798651083085998, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 817889280, "throughput/token_count": 817889280, "throughput/batch_count": 390, "throughput/flop_count": 0, "throughput/total_time": 4550.521985811065, "throughput/update_time": 2635.3166619893163, "throughput/token_count_per_second_total_recent": 177137.27524221846, "throughput/token_count_per_second_total_cum": 179735.26609699987, "throughput/token_count_per_second_update_recent": 310591.29091642407, "throughput/token_count_per_second_update_cum": 310357.11639397504, "throughput/batch_count_per_second_total_recent": 0.08446563493834422, "throughput/batch_count_per_second_total_cum": 0.0857044535145759, "throughput/batch_count_per_second_update_recent": 0.14810146852322773, "throughput/batch_count_per_second_update_cum": 0.14798980540941956, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 838860800, "throughput/token_count": 838860800, "throughput/batch_count": 400, "throughput/flop_count": 0, "throughput/total_time": 4618.154446462984, "throughput/update_time": 2702.834149704431, "throughput/token_count_per_second_total_recent": 193852.26992857043, "throughput/token_count_per_second_total_cum": 181644.1632094999, "throughput/token_count_per_second_update_recent": 310597.76668616955, "throughput/token_count_per_second_update_cum": 310363.4013547349, "throughput/batch_count_per_second_total_recent": 0.09243596550396463, "throughput/batch_count_per_second_total_cum": 0.08661468658900257, "throughput/batch_count_per_second_update_recent": 0.14810455641087034, "throughput/batch_count_per_second_update_cum": 0.1479928023122477, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 859832320, "throughput/token_count": 859832320, "throughput/batch_count": 410, "throughput/flop_count": 0, "throughput/total_time": 4786.292578452965, "throughput/update_time": 2770.3491651542718, "throughput/token_count_per_second_total_recent": 177219.60651563347, "throughput/token_count_per_second_total_cum": 179644.74714120312, "throughput/token_count_per_second_update_recent": 310597.83277748537, "throughput/token_count_per_second_update_cum": 310369.65694254596, "throughput/batch_count_per_second_total_recent": 0.08450489354879068, "throughput/batch_count_per_second_total_cum": 0.08566129071293026, "throughput/batch_count_per_second_update_recent": 0.14810458792566555, "throughput/batch_count_per_second_update_cum": 0.14799578520896242, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 880803840, "throughput/token_count": 880803840, "throughput/batch_count": 420, "throughput/flop_count": 0, "throughput/total_time": 4853.921031603008, "throughput/update_time": 2837.8658607284306, "throughput/token_count_per_second_total_recent": 193860.01442125667, "throughput/token_count_per_second_total_cum": 181462.3341140584, "throughput/token_count_per_second_update_recent": 310608.93188227335, "throughput/token_count_per_second_update_cum": 310375.4311255265, "throughput/batch_count_per_second_total_recent": 0.09243965836584886, "throughput/batch_count_per_second_total_cum": 0.0865279837198536, "throughput/batch_count_per_second_update_recent": 0.14810988039125125, "throughput/batch_count_per_second_update_cum": 0.1479985385539658, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 901775360, "throughput/token_count": 901775360, "throughput/batch_count": 430, "throughput/flop_count": 0, "throughput/total_time": 5021.374068334, "throughput/update_time": 2905.373775637592, "throughput/token_count_per_second_total_recent": 177330.96132216405, "throughput/token_count_per_second_total_cum": 179587.36945865347, "throughput/token_count_per_second_update_recent": 310610.1565426891, "throughput/token_count_per_second_update_cum": 310381.8749799595, "throughput/batch_count_per_second_total_recent": 0.08455799165828898, "throughput/batch_count_per_second_total_cum": 0.085633930901839, "throughput/batch_count_per_second_update_recent": 0.14811046435484365, "throughput/batch_count_per_second_update_cum": 0.14800161122320152, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 922746880, "throughput/token_count": 922746880, "throughput/batch_count": 440, "throughput/flop_count": 0, "throughput/total_time": 5089.012363151065, "throughput/update_time": 2972.8906648436096, "throughput/token_count_per_second_total_recent": 193937.16985188195, "throughput/token_count_per_second_total_cum": 181321.40662135166, "throughput/token_count_per_second_update_recent": 310605.2773276056, "throughput/token_count_per_second_update_cum": 310387.0892098689, "throughput/batch_count_per_second_total_recent": 0.09247644894212816, "throughput/batch_count_per_second_total_cum": 0.08646078425471862, "throughput/batch_count_per_second_update_recent": 0.14810813776378898, "throughput/batch_count_per_second_update_cum": 0.14800409756177374, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 943718400, "throughput/token_count": 943718400, "throughput/batch_count": 450, "throughput/flop_count": 0, "throughput/total_time": 5256.981477431022, "throughput/update_time": 3040.4158616837813, "throughput/token_count_per_second_total_recent": 177321.7054775452, "throughput/token_count_per_second_total_cum": 179517.1628531542, "throughput/token_count_per_second_update_recent": 310611.4183139714, "throughput/token_count_per_second_update_cum": 310391.22374442854, "throughput/batch_count_per_second_total_recent": 0.08455357812764416, "throughput/batch_count_per_second_total_cum": 0.08560045378358565, "throughput/batch_count_per_second_update_recent": 0.1481110660142762, "throughput/batch_count_per_second_update_cum": 0.14800606906148364, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 964689920, "throughput/token_count": 964689920, "throughput/batch_count": 460, "throughput/flop_count": 0, "throughput/total_time": 5324.629123619059, "throughput/update_time": 3107.9427982217167, "throughput/token_count_per_second_total_recent": 193963.0485329481, "throughput/token_count_per_second_total_cum": 181175.04479717018, "throughput/token_count_per_second_update_recent": 310609.8500090887, "throughput/token_count_per_second_update_cum": 310395.0048733105, "throughput/batch_count_per_second_total_recent": 0.09248878885886579, "throughput/batch_count_per_second_total_cum": 0.0863909934984065, "throughput/batch_count_per_second_update_recent": 0.14811031818823275, "throughput/batch_count_per_second_update_cum": 0.14800787204423452, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 985661440, "throughput/token_count": 985661440, "throughput/batch_count": 470, "throughput/flop_count": 0, "throughput/total_time": 5492.093443386024, "throughput/update_time": 3175.472754184855, "throughput/token_count_per_second_total_recent": 177415.99837706194, "throughput/token_count_per_second_total_cum": 179469.1678429115, "throughput/token_count_per_second_update_recent": 310601.34425370983, "throughput/token_count_per_second_update_cum": 310398.3300442518, "throughput/batch_count_per_second_total_recent": 0.08459854048588845, "throughput/batch_count_per_second_total_cum": 0.08557756797929358, "throughput/batch_count_per_second_update_recent": 0.14810626232800952, "throughput/batch_count_per_second_update_cum": 0.1480094576092967, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1006632960, "throughput/token_count": 1006632960, "throughput/batch_count": 480, "throughput/flop_count": 0, "throughput/total_time": 5559.744059987017, "throughput/update_time": 3242.99941329984, "throughput/token_count_per_second_total_recent": 194019.7185735889, "throughput/token_count_per_second_total_cum": 181057.4280288634, "throughput/token_count_per_second_update_recent": 310597.63029383216, "throughput/token_count_per_second_update_cum": 310401.8322888698, "throughput/batch_count_per_second_total_recent": 0.0925158112400002, "throughput/batch_count_per_second_total_cum": 0.08633490945285005, "throughput/batch_count_per_second_update_recent": 0.14810449137393578, "throughput/batch_count_per_second_update_cum": 0.14801112760966767, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1027604480, "throughput/token_count": 1027604480, "throughput/batch_count": 490, "throughput/flop_count": 0, "throughput/total_time": 5727.743233548012, "throughput/update_time": 3310.5120258319657, "throughput/token_count_per_second_total_recent": 177382.86696346218, "throughput/token_count_per_second_total_cum": 179408.26571645346, "throughput/token_count_per_second_update_recent": 310598.7435019416, "throughput/token_count_per_second_update_cum": 310406.50871574844, "throughput/batch_count_per_second_total_recent": 0.08458274219678029, "throughput/batch_count_per_second_total_cum": 0.0855485275823848, "throughput/batch_count_per_second_update_recent": 0.14810502219292718, "throughput/batch_count_per_second_update_cum": 0.14801335750377104, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1048576000, "throughput/token_count": 1048576000, "throughput/batch_count": 500, "throughput/flop_count": 0, "throughput/total_time": 5795.387424343033, "throughput/update_time": 3378.042265718919, "throughput/token_count_per_second_total_recent": 194044.50299244947, "throughput/token_count_per_second_total_cum": 180932.8562911162, "throughput/token_count_per_second_update_recent": 310592.5640752415, "throughput/token_count_per_second_update_cum": 310409.3784264244, "throughput/batch_count_per_second_total_recent": 0.09252762937185739, "throughput/batch_count_per_second_total_cum": 0.08627550901943025, "throughput/batch_count_per_second_update_recent": 0.14810207561266017, "throughput/batch_count_per_second_update_cum": 0.14801472588845463, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1069547520, "throughput/token_count": 1069547520, "throughput/batch_count": 510, "throughput/flop_count": 0, "throughput/total_time": 5963.751827322063, "throughput/update_time": 3445.5543855928117, "throughput/token_count_per_second_total_recent": 177347.32652597054, "throughput/token_count_per_second_total_cum": 179341.38625621932, "throughput/token_count_per_second_update_recent": 310594.9776497016, "throughput/token_count_per_second_update_cum": 310413.7680926441, "throughput/batch_count_per_second_total_recent": 0.08456579519556548, "throughput/batch_count_per_second_total_cum": 0.0855166369706246, "throughput/batch_count_per_second_update_recent": 0.14810322649464683, "throughput/batch_count_per_second_update_cum": 0.1480168190444203, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1090519040, "throughput/token_count": 1090519040, "throughput/batch_count": 520, "throughput/flop_count": 0, "throughput/total_time": 6031.410010787076, "throughput/update_time": 3513.0946629439713, "throughput/token_count_per_second_total_recent": 193874.28274122826, "throughput/token_count_per_second_total_cum": 180806.65019450258, "throughput/token_count_per_second_update_recent": 310583.5776683292, "throughput/token_count_per_second_update_cum": 310415.5010403693, "throughput/batch_count_per_second_total_recent": 0.09244646203099645, "throughput/batch_count_per_second_total_cum": 0.08621532926297311, "throughput/batch_count_per_second_update_recent": 0.14809779055992564, "throughput/batch_count_per_second_update_cum": 0.14801764537828888, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1111490560, "throughput/token_count": 1111490560, "throughput/batch_count": 530, "throughput/flop_count": 0, "throughput/total_time": 6198.96821509907, "throughput/update_time": 3580.623477048124, "throughput/token_count_per_second_total_recent": 177326.58815892736, "throughput/token_count_per_second_total_cum": 179302.50993910548, "throughput/token_count_per_second_update_recent": 310574.15917881706, "throughput/token_count_per_second_update_cum": 310418.1624023523, "throughput/batch_count_per_second_total_recent": 0.08455590637155884, "throughput/batch_count_per_second_total_cum": 0.08549809929805063, "throughput/batch_count_per_second_update_recent": 0.14809329947415212, "throughput/batch_count_per_second_update_cum": 0.14801891441457382, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1132462080, "throughput/token_count": 1132462080, "throughput/batch_count": 540, "throughput/flop_count": 0, "throughput/total_time": 6266.654250354972, "throughput/update_time": 3648.1618733102223, "throughput/token_count_per_second_total_recent": 193938.0367305587, "throughput/token_count_per_second_total_cum": 180712.3920927746, "throughput/token_count_per_second_update_recent": 310563.4160866158, "throughput/token_count_per_second_update_cum": 310419.9098962791, "throughput/batch_count_per_second_total_recent": 0.09247686230209289, "throughput/batch_count_per_second_total_cum": 0.08617038349760751, "throughput/batch_count_per_second_update_recent": 0.14808817676859656, "throughput/batch_count_per_second_update_cum": 0.14801974768461185, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1153433600, "throughput/token_count": 1153433600, "throughput/batch_count": 550, "throughput/flop_count": 0, "throughput/total_time": 6434.099346054019, "throughput/update_time": 3715.6762271750486, "throughput/token_count_per_second_total_recent": 177398.71415366398, "throughput/token_count_per_second_total_cum": 179268.85146829314, "throughput/token_count_per_second_update_recent": 310568.34484203305, "throughput/token_count_per_second_update_cum": 310423.6024560546, "throughput/batch_count_per_second_total_recent": 0.08459029872592162, "throughput/batch_count_per_second_total_cum": 0.08548204968847901, "throughput/batch_count_per_second_update_recent": 0.1480905269823232, "throughput/batch_count_per_second_update_cum": 0.14802150843432169, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1174405120, "throughput/token_count": 1174405120, "throughput/batch_count": 560, "throughput/flop_count": 0, "throughput/total_time": 6501.729716927046, "throughput/update_time": 3783.1945504179457, "throughput/token_count_per_second_total_recent": 193941.08879734448, "throughput/token_count_per_second_total_cum": 180629.6433612849, "throughput/token_count_per_second_update_recent": 310570.83682747884, "throughput/token_count_per_second_update_cum": 310426.83751758374, "throughput/batch_count_per_second_total_recent": 0.09247831764094566, "throughput/batch_count_per_second_total_cum": 0.08613092582763905, "throughput/batch_count_per_second_update_recent": 0.14809171525358145, "throughput/batch_count_per_second_update_cum": 0.14802305103186786, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1195376640, "throughput/token_count": 1195376640, "throughput/batch_count": 570, "throughput/flop_count": 0, "throughput/total_time": 6669.622392687015, "throughput/update_time": 3850.7147646707017, "throughput/token_count_per_second_total_recent": 177336.46039858743, "throughput/token_count_per_second_total_cum": 179227.03409876465, "throughput/token_count_per_second_update_recent": 310577.0332568007, "throughput/token_count_per_second_update_cum": 310429.8066860904, "throughput/batch_count_per_second_total_recent": 0.08456061382226344, "throughput/batch_count_per_second_total_cum": 0.08546210961282952, "throughput/batch_count_per_second_update_recent": 0.1480946699413303, "throughput/batch_count_per_second_update_cum": 0.1480244668417408, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1216348160, "throughput/token_count": 1216348160, "throughput/batch_count": 580, "throughput/flop_count": 0, "throughput/total_time": 6737.260854291031, "throughput/update_time": 3918.229618334677, "throughput/token_count_per_second_total_recent": 193966.30251861684, "throughput/token_count_per_second_total_cum": 180540.45795559412, "throughput/token_count_per_second_update_recent": 310581.6692495121, "throughput/token_count_per_second_update_cum": 310433.0982309739, "throughput/batch_count_per_second_total_recent": 0.0924903404801449, "throughput/batch_count_per_second_total_cum": 0.08608839891223627, "throughput/batch_count_per_second_update_recent": 0.14809688055492023, "throughput/batch_count_per_second_update_cum": 0.14802603637264913, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1237319680, "throughput/token_count": 1237319680, "throughput/batch_count": 590, "throughput/flop_count": 0, "throughput/total_time": 6905.097716190037, "throughput/update_time": 3985.7243089615367, "throughput/token_count_per_second_total_recent": 177362.44055041834, "throughput/token_count_per_second_total_cum": 179189.3077919692, "throughput/token_count_per_second_update_recent": 310590.44141750847, "throughput/token_count_per_second_update_cum": 310437.84870368475, "throughput/batch_count_per_second_total_recent": 0.08457300212403218, "throughput/batch_count_per_second_total_cum": 0.08544412030790767, "throughput/batch_count_per_second_update_recent": 0.1481010634505789, "throughput/batch_count_per_second_update_cum": 0.1480283015745567, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1258291200, "throughput/token_count": 1258291200, "throughput/batch_count": 600, "throughput/flop_count": 0, "throughput/total_time": 6972.698619629024, "throughput/update_time": 4053.2279500714503, "throughput/token_count_per_second_total_recent": 194070.7803932207, "throughput/token_count_per_second_total_cum": 180459.71418551664, "throughput/token_count_per_second_update_recent": 310603.192308802, "throughput/token_count_per_second_update_cum": 310441.7554353978, "throughput/batch_count_per_second_total_recent": 0.0925401594129661, "throughput/batch_count_per_second_total_cum": 0.08604989728236992, "throughput/batch_count_per_second_update_recent": 0.14810714354934787, "throughput/batch_count_per_second_update_cum": 0.14803016444940462, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1279262720, "throughput/token_count": 1279262720, "throughput/batch_count": 610, "throughput/flop_count": 0, "throughput/total_time": 7141.385740551981, "throughput/update_time": 4120.727246251656, "throughput/token_count_per_second_total_recent": 177320.13762849654, "throughput/token_count_per_second_total_cum": 179133.68167970178, "throughput/token_count_per_second_update_recent": 310609.9600387903, "throughput/token_count_per_second_update_cum": 310445.86150749435, "throughput/batch_count_per_second_total_recent": 0.0845528305189593, "throughput/batch_count_per_second_total_cum": 0.08541759571061219, "throughput/batch_count_per_second_update_recent": 0.148110370654483, "throughput/batch_count_per_second_update_cum": 0.14803212237715452, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1300234240, "throughput/token_count": 1300234240, "throughput/batch_count": 620, "throughput/flop_count": 0, "throughput/total_time": 7208.982709518052, "throughput/update_time": 4188.228603066411, "throughput/token_count_per_second_total_recent": 193873.65671452935, "throughput/token_count_per_second_total_cum": 180363.06818759532, "throughput/token_count_per_second_update_recent": 310625.6553389502, "throughput/token_count_per_second_update_cum": 310449.6824858208, "throughput/batch_count_per_second_total_recent": 0.09244616351820438, "throughput/batch_count_per_second_total_cum": 0.08600381287936941, "throughput/batch_count_per_second_update_recent": 0.1481178547568084, "throughput/batch_count_per_second_update_cum": 0.14803394436160125, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1321205760, "throughput/token_count": 1321205760, "throughput/batch_count": 630, "throughput/flop_count": 0, "throughput/total_time": 7376.684962157044, "throughput/update_time": 4255.729584518704, "throughput/token_count_per_second_total_recent": 177308.2365446704, "throughput/token_count_per_second_total_cum": 179105.62356639688, "throughput/token_count_per_second_update_recent": 310640.89265418646, "throughput/token_count_per_second_update_cum": 310453.4096353822, "throughput/batch_count_per_second_total_recent": 0.0845471556399681, "throughput/batch_count_per_second_total_cum": 0.08540421655959934, "throughput/batch_count_per_second_update_recent": 0.14812512047490428, "throughput/batch_count_per_second_update_cum": 0.1480357216050063, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1342177280, "throughput/token_count": 1342177280, "throughput/batch_count": 640, "throughput/flop_count": 0, "throughput/total_time": 7444.285434685065, "throughput/update_time": 4323.2266572538065, "throughput/token_count_per_second_total_recent": 193846.87347387933, "throughput/token_count_per_second_total_cum": 180296.32149063097, "throughput/token_count_per_second_update_recent": 310657.9321675783, "throughput/token_count_per_second_update_cum": 310457.3010873725, "throughput/batch_count_per_second_total_recent": 0.09243339227384535, "throughput/batch_count_per_second_total_cum": 0.08597198557406949, "throughput/batch_count_per_second_update_recent": 0.14813324554804722, "throughput/batch_count_per_second_update_cum": 0.14803757719391467, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1363148800, "throughput/token_count": 1363148800, "throughput/batch_count": 650, "throughput/flop_count": 0, "throughput/total_time": 7612.232262870064, "throughput/update_time": 4390.719008370885, "throughput/token_count_per_second_total_recent": 177245.48213425136, "throughput/token_count_per_second_total_cum": 179073.46398887306, "throughput/token_count_per_second_update_recent": 310669.9409660194, "throughput/token_count_per_second_update_cum": 310461.40675391967, "throughput/batch_count_per_second_total_recent": 0.08451723200523918, "throughput/batch_count_per_second_total_cum": 0.08538888167804387, "throughput/batch_count_per_second_update_recent": 0.1481389717893693, "throughput/batch_count_per_second_update_cum": 0.14803953492828353, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1384120320, "throughput/token_count": 1384120320, "throughput/batch_count": 660, "throughput/flop_count": 0, "throughput/total_time": 7679.831617571064, "throughput/update_time": 4458.218750593718, "throughput/token_count_per_second_total_recent": 193842.68809028092, "throughput/token_count_per_second_total_cum": 180227.95146096734, "throughput/token_count_per_second_update_recent": 310677.42524575855, "throughput/token_count_per_second_update_cum": 310464.87340166327, "throughput/batch_count_per_second_total_recent": 0.0924313965274243, "throughput/batch_count_per_second_total_cum": 0.08593938420341841, "throughput/batch_count_per_second_update_recent": 0.1481425405720513, "throughput/batch_count_per_second_update_cum": 0.14804118795474208, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1405091840, "throughput/token_count": 1405091840, "throughput/batch_count": 670, "throughput/flop_count": 0, "throughput/total_time": 7847.450622586999, "throughput/update_time": 4525.717398178764, "throughput/token_count_per_second_total_recent": 177291.55559284295, "throughput/token_count_per_second_total_cum": 179050.7398613992, "throughput/token_count_per_second_update_recent": 310688.8057537914, "throughput/token_count_per_second_update_cum": 310468.31173449673, "throughput/batch_count_per_second_total_recent": 0.08453920154230259, "throughput/batch_count_per_second_total_cum": 0.08537804596967659, "throughput/batch_count_per_second_update_recent": 0.1481479672211606, "throughput/batch_count_per_second_update_cum": 0.14804282747959935, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1426063360, "throughput/token_count": 1426063360, "throughput/batch_count": 680, "throughput/flop_count": 0, "throughput/total_time": 7915.0508393970085, "throughput/update_time": 4593.218119100784, "throughput/token_count_per_second_total_recent": 193892.4230768426, "throughput/token_count_per_second_total_cum": 180171.09288822225, "throughput/token_count_per_second_update_recent": 310693.0316657784, "throughput/token_count_per_second_update_cum": 310471.50886864064, "throughput/batch_count_per_second_total_recent": 0.09245511201707964, "throughput/batch_count_per_second_total_cum": 0.08591227192317116, "throughput/batch_count_per_second_update_recent": 0.1481499822930233, "throughput/batch_count_per_second_update_cum": 0.14804435199195892, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1447034880, "throughput/token_count": 1447034880, "throughput/batch_count": 690, "throughput/flop_count": 0, "throughput/total_time": 8083.225679086056, "throughput/update_time": 4660.71875171666, "throughput/token_count_per_second_total_recent": 177245.07904849347, "throughput/token_count_per_second_total_cum": 179017.00848758334, "throughput/token_count_per_second_update_recent": 310691.29950205266, "throughput/token_count_per_second_update_cum": 310474.6192777714, "throughput/batch_count_per_second_total_recent": 0.08451703979897188, "throughput/batch_count_per_second_total_cum": 0.08536196159724395, "throughput/batch_count_per_second_update_recent": 0.14814915633299477, "throughput/batch_count_per_second_update_cum": 0.14804583515060968, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1468006400, "throughput/token_count": 1468006400, "throughput/batch_count": 700, "throughput/flop_count": 0, "throughput/total_time": 8150.834610218997, "throughput/update_time": 4728.229743778473, "throughput/token_count_per_second_total_recent": 193982.88623133142, "throughput/token_count_per_second_total_cum": 180105.04079662065, "throughput/token_count_per_second_update_recent": 310688.26315260347, "throughput/token_count_per_second_update_cum": 310476.96062815917, "throughput/batch_count_per_second_total_recent": 0.09249824821058818, "throughput/batch_count_per_second_total_cum": 0.08588077583151849, "throughput/batch_count_per_second_update_recent": 0.1481477084887521, "throughput/batch_count_per_second_update_cum": 0.14804695159347495, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1488977920, "throughput/token_count": 1488977920, "throughput/batch_count": 710, "throughput/flop_count": 0, "throughput/total_time": 8318.88102039101, "throughput/update_time": 4795.734360322589, "throughput/token_count_per_second_total_recent": 177340.2325130586, "throughput/token_count_per_second_total_cum": 178987.76486287743, "throughput/token_count_per_second_update_recent": 310684.99350694986, "throughput/token_count_per_second_update_cum": 310479.6488143773, "throughput/batch_count_per_second_total_recent": 0.0845624125066083, "throughput/batch_count_per_second_total_cum": 0.08534801715034362, "throughput/batch_count_per_second_update_recent": 0.14814614940021031, "throughput/batch_count_per_second_update_cum": 0.14804823342055193, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1509949440, "throughput/token_count": 1509949440, "throughput/batch_count": 720, "throughput/flop_count": 0, "throughput/total_time": 8386.480246171006, "throughput/update_time": 4863.235517622321, "throughput/token_count_per_second_total_recent": 193920.12260342535, "throughput/token_count_per_second_total_cum": 180045.66822766842, "throughput/token_count_per_second_update_recent": 310685.7506145613, "throughput/token_count_per_second_update_cum": 310482.48322101164, "throughput/batch_count_per_second_total_recent": 0.09246832018061893, "throughput/batch_count_per_second_total_cum": 0.08585246478446408, "throughput/batch_count_per_second_update_recent": 0.14814651041725221, "throughput/batch_count_per_second_update_cum": 0.14804958497095663, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1530920960, "throughput/token_count": 1530920960, "throughput/batch_count": 730, "throughput/flop_count": 0, "throughput/total_time": 8554.17625815398, "throughput/update_time": 4930.740559646045, "throughput/token_count_per_second_total_recent": 177341.7692117982, "throughput/token_count_per_second_total_cum": 178967.66606144002, "throughput/token_count_per_second_update_recent": 310684.0402843226, "throughput/token_count_per_second_update_cum": 310484.9954039962, "throughput/batch_count_per_second_total_recent": 0.0845631452616683, "throughput/batch_count_per_second_total_cum": 0.08533843329498292, "throughput/batch_count_per_second_update_recent": 0.14814569486824158, "throughput/batch_count_per_second_update_cum": 0.14805078287315188, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1551892480, "throughput/token_count": 1551892480, "throughput/batch_count": 740, "throughput/flop_count": 0, "throughput/total_time": 8621.791595970979, "throughput/update_time": 4998.250562901143, "throughput/token_count_per_second_total_recent": 193966.1169426344, "throughput/token_count_per_second_total_cum": 179996.51960100842, "throughput/token_count_per_second_update_recent": 310677.0782682617, "throughput/token_count_per_second_update_cum": 310487.1315413272, "throughput/batch_count_per_second_total_recent": 0.0924902519906208, "throughput/batch_count_per_second_total_cum": 0.08582902889299794, "throughput/batch_count_per_second_update_recent": 0.14814237512028774, "throughput/batch_count_per_second_update_cum": 0.14805180146280633, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1572864000, "throughput/token_count": 1572864000, "throughput/batch_count": 750, "throughput/flop_count": 0, "throughput/total_time": 8789.583836077014, "throughput/update_time": 5065.748505146126, "throughput/token_count_per_second_total_recent": 177362.86108743795, "throughput/token_count_per_second_total_cum": 178946.35620222994, "throughput/token_count_per_second_update_recent": 310675.39347796835, "throughput/token_count_per_second_update_cum": 310489.9499851166, "throughput/batch_count_per_second_total_recent": 0.08457320265170953, "throughput/batch_count_per_second_total_cum": 0.08532827196227548, "throughput/batch_count_per_second_update_recent": 0.1481415717496721, "throughput/batch_count_per_second_update_cum": 0.14805314540153341, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1593835520, "throughput/token_count": 1593835520, "throughput/batch_count": 760, "throughput/flop_count": 0, "throughput/total_time": 8857.199786913, "throughput/update_time": 5133.253916564281, "throughput/token_count_per_second_total_recent": 193929.06474965144, "throughput/token_count_per_second_total_cum": 179948.0149871949, "throughput/token_count_per_second_update_recent": 310671.48481786496, "throughput/token_count_per_second_update_cum": 310492.24252416566, "throughput/batch_count_per_second_total_recent": 0.09247258412821362, "throughput/batch_count_per_second_total_cum": 0.08580590009078737, "throughput/batch_count_per_second_update_recent": 0.14813970795529602, "throughput/batch_count_per_second_update_cum": 0.14805423856933864, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1614807040, "throughput/token_count": 1614807040, "throughput/batch_count": 770, "throughput/flop_count": 0, "throughput/total_time": 9025.246275466983, "throughput/update_time": 5200.750415898627, "throughput/token_count_per_second_total_recent": 177295.53936829927, "throughput/token_count_per_second_total_cum": 178921.10538739254, "throughput/token_count_per_second_update_recent": 310675.00789981184, "throughput/token_count_per_second_update_cum": 310495.00761727686, "throughput/batch_count_per_second_total_recent": 0.0845411011544701, "throughput/batch_count_per_second_total_cum": 0.0853162314354861, "throughput/batch_count_per_second_update_recent": 0.1481413878916797, "throughput/batch_count_per_second_update_cum": 0.14805555706847995, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1635778560, "throughput/token_count": 1635778560, "throughput/batch_count": 780, "throughput/flop_count": 0, "throughput/total_time": 9092.876169198076, "throughput/update_time": 5268.2632564986125, "throughput/token_count_per_second_total_recent": 193947.1087974449, "throughput/token_count_per_second_total_cum": 179896.71579837025, "throughput/token_count_per_second_update_recent": 310668.2453905363, "throughput/token_count_per_second_update_cum": 310496.7387463415, "throughput/batch_count_per_second_total_recent": 0.09248118820068593, "throughput/batch_count_per_second_total_cum": 0.08578143873137009, "throughput/batch_count_per_second_update_recent": 0.14813816327597443, "throughput/batch_count_per_second_update_cum": 0.1480563825351436, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1656750080, "throughput/token_count": 1656750080, "throughput/batch_count": 790, "throughput/flop_count": 0, "throughput/total_time": 9260.972570382059, "throughput/update_time": 5335.757341830526, "throughput/token_count_per_second_total_recent": 177302.4127011823, "throughput/token_count_per_second_total_cum": 178895.90617064654, "throughput/token_count_per_second_update_recent": 310670.40572241263, "throughput/token_count_per_second_update_cum": 310499.5174746126, "throughput/batch_count_per_second_total_recent": 0.08454437861498942, "throughput/batch_count_per_second_total_cum": 0.08530421551258399, "throughput/batch_count_per_second_update_recent": 0.1481391934024871, "throughput/batch_count_per_second_update_cum": 0.14805770753603584, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1677721600, "throughput/token_count": 1677721600, "throughput/batch_count": 800, "throughput/flop_count": 0, "throughput/total_time": 9328.580250757048, "throughput/update_time": 5403.266901573632, "throughput/token_count_per_second_total_recent": 193938.8215695372, "throughput/token_count_per_second_total_cum": 179847.4746319352, "throughput/token_count_per_second_update_recent": 310673.7629936403, "throughput/token_count_per_second_update_cum": 310501.33753551677, "throughput/batch_count_per_second_total_recent": 0.09247723654248104, "throughput/batch_count_per_second_total_cum": 0.0857579587135006, "throughput/batch_count_per_second_update_recent": 0.14814079427415863, "throughput/batch_count_per_second_update_cum": 0.14805857540870512, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1698693120, "throughput/token_count": 1698693120, "throughput/batch_count": 810, "throughput/flop_count": 0, "throughput/total_time": 9497.46972536901, "throughput/update_time": 5470.770781763713, "throughput/token_count_per_second_total_recent": 177175.59151481115, "throughput/token_count_per_second_total_cum": 178857.43983606115, "throughput/token_count_per_second_update_recent": 310672.3254789631, "throughput/token_count_per_second_update_cum": 310503.4350301113, "throughput/batch_count_per_second_total_recent": 0.08448390556088026, "throughput/batch_count_per_second_total_cum": 0.0852858733349138, "throughput/batch_count_per_second_update_recent": 0.14814010881374506, "throughput/batch_count_per_second_update_cum": 0.14805957557206692, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1719664640, "throughput/token_count": 1719664640, "throughput/batch_count": 820, "throughput/flop_count": 0, "throughput/total_time": 9565.069234885043, "throughput/update_time": 5538.269277713727, "throughput/token_count_per_second_total_recent": 193722.5863643154, "throughput/token_count_per_second_total_cum": 179785.906172865, "throughput/token_count_per_second_update_recent": 310673.14501311886, "throughput/token_count_per_second_update_cum": 310505.7832633773, "throughput/batch_count_per_second_total_recent": 0.0923741275617196, "throughput/batch_count_per_second_total_cum": 0.08572860058444262, "throughput/batch_count_per_second_update_recent": 0.14814049959808295, "throughput/batch_count_per_second_update_cum": 0.1480606952969443, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1740636160, "throughput/token_count": 1740636160, "throughput/batch_count": 830, "throughput/flop_count": 0, "throughput/total_time": 9732.95014503505, "throughput/update_time": 5605.777275754837, "throughput/token_count_per_second_total_recent": 177147.6766807728, "throughput/token_count_per_second_total_cum": 178839.52286429095, "throughput/token_count_per_second_update_recent": 310671.8134239592, "throughput/token_count_per_second_update_cum": 310507.54861922644, "throughput/batch_count_per_second_total_recent": 0.0844705947307457, "throughput/batch_count_per_second_total_cum": 0.0852773298570113, "throughput/batch_count_per_second_update_recent": 0.14813986464689216, "throughput/batch_count_per_second_update_cum": 0.1480615370842106, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1761607680, "throughput/token_count": 1761607680, "throughput/batch_count": 840, "throughput/flop_count": 0, "throughput/total_time": 9800.571015445981, "throughput/update_time": 5673.296408364549, "throughput/token_count_per_second_total_recent": 193706.19217507035, "throughput/token_count_per_second_total_cum": 179745.4125095013, "throughput/token_count_per_second_update_recent": 310667.85228024505, "throughput/token_count_per_second_update_cum": 310508.662548062, "throughput/batch_count_per_second_total_recent": 0.09236631020310895, "throughput/batch_count_per_second_total_cum": 0.08570929170107904, "throughput/batch_count_per_second_update_recent": 0.14813797582638027, "throughput/batch_count_per_second_update_cum": 0.148062068246871, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1782579200, "throughput/token_count": 1782579200, "throughput/batch_count": 850, "throughput/flop_count": 0, "throughput/total_time": 9968.994457261055, "throughput/update_time": 5740.817306284327, "throughput/token_count_per_second_total_recent": 177052.20829689878, "throughput/token_count_per_second_total_cum": 178812.33735681674, "throughput/token_count_per_second_update_recent": 310657.5444127401, "throughput/token_count_per_second_update_cum": 310509.6547922986, "throughput/batch_count_per_second_total_recent": 0.08442507185788097, "throughput/batch_count_per_second_total_cum": 0.08526436679688298, "throughput/batch_count_per_second_update_recent": 0.14813306065213208, "throughput/batch_count_per_second_update_cum": 0.14806254138579303, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1803550720, "throughput/token_count": 1803550720, "throughput/batch_count": 860, "throughput/flop_count": 0, "throughput/total_time": 10036.600267508999, "throughput/update_time": 5808.320603007567, "throughput/token_count_per_second_total_recent": 193639.0397797105, "throughput/token_count_per_second_total_cum": 179697.37480116126, "throughput/token_count_per_second_update_recent": 310656.28356854385, "throughput/token_count_per_second_update_cum": 310511.5649205238, "throughput/batch_count_per_second_total_recent": 0.09233428944573903, "throughput/batch_count_per_second_total_cum": 0.08568638553674758, "throughput/batch_count_per_second_update_recent": 0.1481324594347686, "throughput/batch_count_per_second_update_cum": 0.14806345220590772, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1824522240, "throughput/token_count": 1824522240, "throughput/batch_count": 870, "throughput/flop_count": 0, "throughput/total_time": 10204.47946456098, "throughput/update_time": 5875.824670242728, "throughput/token_count_per_second_total_recent": 177078.5590007201, "throughput/token_count_per_second_total_cum": 178796.20869799017, "throughput/token_count_per_second_update_recent": 310655.63687883824, "throughput/token_count_per_second_update_cum": 310513.3904420313, "throughput/batch_count_per_second_total_recent": 0.08443763685260777, "throughput/batch_count_per_second_total_cum": 0.08525667605304249, "throughput/batch_count_per_second_update_recent": 0.14813215106908714, "throughput/batch_count_per_second_update_cum": 0.1480643226823956, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1845493760, "throughput/token_count": 1845493760, "throughput/batch_count": 880, "throughput/flop_count": 0, "throughput/total_time": 10272.080307441996, "throughput/update_time": 5943.3299956357805, "throughput/token_count_per_second_total_recent": 193683.76398509278, "throughput/token_count_per_second_total_cum": 179661.14990971814, "throughput/token_count_per_second_update_recent": 310657.86897938786, "throughput/token_count_per_second_update_cum": 310515.1087614445, "throughput/batch_count_per_second_total_recent": 0.09235561560873641, "throughput/batch_count_per_second_total_cum": 0.08566911216245562, "throughput/batch_count_per_second_update_recent": 0.14813321541757005, "throughput/batch_count_per_second_update_cum": 0.14806514204094148, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1866465280, "throughput/token_count": 1866465280, "throughput/batch_count": 890, "throughput/flop_count": 0, "throughput/total_time": 10440.040362586034, "throughput/update_time": 6010.818879780243, "throughput/token_count_per_second_total_recent": 177102.24174184696, "throughput/token_count_per_second_total_cum": 178779.50804566333, "throughput/token_count_per_second_update_recent": 310658.82952306047, "throughput/token_count_per_second_update_cum": 310517.63783443737, "throughput/batch_count_per_second_total_recent": 0.08444892966358516, "throughput/batch_count_per_second_total_cum": 0.08524871256144682, "throughput/batch_count_per_second_update_recent": 0.14813367344048523, "throughput/batch_count_per_second_update_cum": 0.14806634799692028, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1887436800, "throughput/token_count": 1887436800, "throughput/batch_count": 900, "throughput/flop_count": 0, "throughput/total_time": 10507.649754095008, "throughput/update_time": 6078.3277447193395, "throughput/token_count_per_second_total_recent": 193850.2825186233, "throughput/token_count_per_second_total_cum": 179625.02026339754, "throughput/token_count_per_second_update_recent": 310659.3626293796, "throughput/token_count_per_second_update_cum": 310519.0899980255, "throughput/batch_count_per_second_total_recent": 0.09243501783305326, "throughput/batch_count_per_second_total_cum": 0.08565188420457723, "throughput/batch_count_per_second_update_recent": 0.14813392764538746, "throughput/batch_count_per_second_update_cum": 0.1480670404424789, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1908408320, "throughput/token_count": 1908408320, "throughput/batch_count": 910, "throughput/flop_count": 0, "throughput/total_time": 10707.957879410009, "throughput/update_time": 6177.141915226472, "throughput/token_count_per_second_total_recent": 172480.3924732762, "throughput/token_count_per_second_total_cum": 178223.36821754012, "throughput/token_count_per_second_update_recent": 296757.88501307624, "throughput/token_count_per_second_update_cum": 308946.8149170784, "throughput/batch_count_per_second_total_recent": 0.0822450601927167, "throughput/batch_count_per_second_total_cum": 0.08498352442624098, "throughput/batch_count_per_second_update_recent": 0.14150518656400501, "throughput/batch_count_per_second_update_cum": 0.1473173212609665, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1929379840, "throughput/token_count": 1929379840, "throughput/batch_count": 920, "throughput/flop_count": 0, "throughput/total_time": 10899.09726011497, "throughput/update_time": 6367.991133535514, "throughput/token_count_per_second_total_recent": 169208.92091212468, "throughput/token_count_per_second_total_cum": 177021.98576211694, "throughput/token_count_per_second_update_recent": 252277.8605732299, "throughput/token_count_per_second_update_cum": 302980.9243670864, "throughput/batch_count_per_second_total_recent": 0.08068510099035486, "throughput/batch_count_per_second_total_cum": 0.0844106606302819, "throughput/batch_count_per_second_update_recent": 0.12029545811330314, "throughput/batch_count_per_second_update_cum": 0.14447256296495742, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1950351360, "throughput/token_count": 1950351360, "throughput/batch_count": 930, "throughput/flop_count": 0, "throughput/total_time": 11285.98193423904, "throughput/update_time": 6497.941741914721, "throughput/token_count_per_second_total_recent": 134270.64333414187, "throughput/token_count_per_second_total_cum": 172811.84493863917, "throughput/token_count_per_second_update_recent": 234487.31601383063, "throughput/token_count_per_second_update_cum": 300149.099124625, "throughput/batch_count_per_second_total_recent": 0.06402523199755758, "throughput/batch_count_per_second_total_cum": 0.0824031090443798, "throughput/batch_count_per_second_update_recent": 0.11181226540271312, "throughput/batch_count_per_second_update_cum": 0.14312224346381425, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1971322880, "throughput/token_count": 1971322880, "throughput/batch_count": 940, "throughput/flop_count": 0, "throughput/total_time": 11471.464317596983, "throughput/update_time": 6683.226909449557, "throughput/token_count_per_second_total_recent": 132805.3207573242, "throughput/token_count_per_second_total_cum": 171845.79277956978, "throughput/token_count_per_second_update_recent": 206960.32855764765, "throughput/token_count_per_second_update_cum": 294965.7263937432, "throughput/batch_count_per_second_total_recent": 0.06332651174417696, "throughput/batch_count_per_second_total_cum": 0.08194245947817315, "throughput/batch_count_per_second_update_recent": 0.09868637493021376, "throughput/batch_count_per_second_update_cum": 0.140650618740913, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 1992294400, "throughput/token_count": 1992294400, "throughput/batch_count": 950, "throughput/flop_count": 0, "throughput/total_time": 11871.571419561049, "throughput/update_time": 6813.441040770616, "throughput/token_count_per_second_total_recent": 109514.30539890044, "throughput/token_count_per_second_total_cum": 167820.61359772919, "throughput/token_count_per_second_update_recent": 194786.8313983746, "throughput/token_count_per_second_update_cum": 292406.49300087977, "throughput/batch_count_per_second_total_recent": 0.052220490169000836, "throughput/batch_count_per_second_total_cum": 0.0800231044758459, "throughput/batch_count_per_second_update_recent": 0.09288159913939219, "throughput/batch_count_per_second_update_cum": 0.13943028116268147, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2013265920, "throughput/token_count": 2013265920, "throughput/batch_count": 960, "throughput/flop_count": 0, "throughput/total_time": 12034.565373313962, "throughput/update_time": 6976.259414628497, "throughput/token_count_per_second_total_recent": 109797.21950837731, "throughput/token_count_per_second_total_cum": 167290.28905890652, "throughput/token_count_per_second_update_recent": 178797.97892805707, "throughput/token_count_per_second_update_cum": 288588.1674323046, "throughput/batch_count_per_second_total_recent": 0.052355394128979355, "throughput/batch_count_per_second_total_cum": 0.07977022602982832, "throughput/batch_count_per_second_update_recent": 0.08525752016451696, "throughput/batch_count_per_second_update_cum": 0.13760956164946775, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2034237440, "throughput/token_count": 2034237440, "throughput/batch_count": 970, "throughput/flop_count": 0, "throughput/total_time": 12449.846788060968, "throughput/update_time": 7106.677357654553, "throughput/token_count_per_second_total_recent": 92744.4150894768, "throughput/token_count_per_second_total_cum": 163394.5762248876, "throughput/token_count_per_second_update_recent": 169608.03032163144, "throughput/token_count_per_second_update_cum": 286243.1116010827, "throughput/batch_count_per_second_total_recent": 0.04422398333047714, "throughput/batch_count_per_second_total_cum": 0.0779126053928793, "throughput/batch_count_per_second_update_recent": 0.08087541118699619, "throughput/batch_count_per_second_update_cum": 0.13649135189108025, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} +{"step": 2055208960, "throughput/token_count": 2055208960, "throughput/batch_count": 980, "throughput/flop_count": 0, "throughput/total_time": 12586.090243482962, "throughput/update_time": 7242.765970333596, "throughput/token_count_per_second_total_recent": 94077.61034940367, "throughput/token_count_per_second_total_cum": 163292.08834841946, "throughput/token_count_per_second_update_recent": 160609.6338906823, "throughput/token_count_per_second_update_cum": 283760.2330957739, "throughput/batch_count_per_second_total_recent": 0.044859700369550545, "throughput/batch_count_per_second_total_cum": 0.07786373536511396, "throughput/batch_count_per_second_update_recent": 0.07658464140447727, "throughput/batch_count_per_second_update_cum": 0.13530742316044517, "throughput/flop_count_per_second_total_recent": 0.0, "throughput/mfu_total_recent": 0.0, "throughput/flop_count_per_second_total_cum": 0.0, "throughput/mfu_total_cum": 0.0, "throughput/flop_count_per_second_update_recent": 0.0, "throughput/mfu_update_recent": 0.0, "throughput/flop_count_per_second_update_cum": 0.0, "throughput/mfu_update_cum": 0.0} diff --git a/metrics/jsonlines/train.jsonl b/metrics/jsonlines/train.jsonl index de016af6e16a79b6fdefb50f9b73de335cd5c40a..e61293a948a42a027afacd6c7f2d9ba5e3acacbe 100644 --- a/metrics/jsonlines/train.jsonl +++ b/metrics/jsonlines/train.jsonl @@ -1,98 +1,98 @@ -{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 59.67119211301906, "train/update_time": 59.47943267202936, "train/lr": 0.0009000000000000001, "train/loss": 9.761818885803223, "train/global_grad_norm": 1.2346482276916504} -{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 115.56162341398885, "train/update_time": 115.25218190002488, "train/lr": 0.0009997960964140947, "train/loss": 8.126625061035156, "train/global_grad_norm": 0.962837278842926} -{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 261.80084426596295, "train/update_time": 171.00324709707638, "train/lr": 0.0009990914580222257, "train/loss": 7.519778728485107, "train/global_grad_norm": 0.5695855021476746} -{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 317.6585605319706, "train/update_time": 226.74485654599266, "train/lr": 0.0009978842768382998, "train/loss": 7.193304061889648, "train/global_grad_norm": 0.4217643439769745} -{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 464.0044742010068, "train/update_time": 282.4727703850367, "train/lr": 0.0009961757683914405, "train/loss": 6.9472150802612305, "train/global_grad_norm": 0.26760002970695496} -{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 519.8498190940008, "train/update_time": 338.1969051870401, "train/lr": 0.00099396765300483, "train/loss": 6.68041467666626, "train/global_grad_norm": 0.31579363346099854} -{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 665.9225179139758, "train/update_time": 393.9277793511865, "train/lr": 0.0009912621540634887, "train/loss": 6.480125904083252, "train/global_grad_norm": 0.26012396812438965} -{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 721.7491259319941, "train/update_time": 449.6537483881111, "train/lr": 0.000988061995775515, "train/loss": 6.281551837921143, "train/global_grad_norm": 0.39679110050201416} -{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 867.7238940689713, "train/update_time": 505.3731428361498, "train/lr": 0.0009843704004290394, "train/loss": 6.122912406921387, "train/global_grad_norm": 1.23171067237854} -{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 923.5610087590176, "train/update_time": 561.0979154942906, "train/lr": 0.0009801910851476522, "train/loss": 5.9722723960876465, "train/global_grad_norm": 0.3574962913990021} -{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1070.0474769069697, "train/update_time": 616.8178668163018, "train/lr": 0.0009755282581475768, "train/loss": 5.849911212921143, "train/global_grad_norm": 0.38126564025878906} -{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1125.8911167309852, "train/update_time": 672.5490376223461, "train/lr": 0.0009703866145003512, "train/loss": 5.7178874015808105, "train/global_grad_norm": 0.6952179670333862} -{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1271.9843903059955, "train/update_time": 728.276082833414, "train/lr": 0.0009647713314052896, "train/loss": 5.644232749938965, "train/global_grad_norm": 0.34717857837677} -{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1327.8194525539875, "train/update_time": 784.0098267712165, "train/lr": 0.0009586880629764817, "train/loss": 5.570384502410889, "train/global_grad_norm": 0.6765910983085632} -{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1473.913084711996, "train/update_time": 839.7419353383593, "train/lr": 0.0009521429345495787, "train/loss": 5.444611072540283, "train/global_grad_norm": 0.4169935882091522} -{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1529.7566971820197, "train/update_time": 895.4798201125232, "train/lr": 0.0009451425365140996, "train/loss": 5.40510368347168, "train/global_grad_norm": 0.709697961807251} -{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1675.8973924720194, "train/update_time": 951.2102684524143, "train/lr": 0.000937693917677468, "train/loss": 5.298379421234131, "train/global_grad_norm": 0.35993462800979614} -{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1731.7409806579817, "train/update_time": 1006.941321704362, "train/lr": 0.0009298045781674596, "train/loss": 5.267183303833008, "train/global_grad_norm": 0.45855849981307983} -{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1877.79561357497, "train/update_time": 1062.6691502483445, "train/lr": 0.0009214824618802108, "train/loss": 5.240725994110107, "train/global_grad_norm": 0.45877301692962646} -{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1933.6392927800189, "train/update_time": 1118.3960400532233, "train/lr": 0.000912735948481387, "train/loss": 5.148595809936523, "train/global_grad_norm": 0.5232999920845032} -{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2080.2760781620163, "train/update_time": 1174.1330189242726, "train/lr": 0.0009035738449685707, "train/loss": 5.102267742156982, "train/global_grad_norm": 0.40673965215682983} -{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2136.1286090469803, "train/update_time": 1229.8719967252691, "train/lr": 0.0008940053768033609, "train/loss": 5.072765827178955, "train/global_grad_norm": 0.540256679058075} -{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2282.3240791389835, "train/update_time": 1285.6013174692634, "train/lr": 0.0008840401786221159, "train/loss": 5.013406276702881, "train/global_grad_norm": 0.4202441871166229} -{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2338.156839519972, "train/update_time": 1341.322897736216, "train/lr": 0.0008736882845346905, "train/loss": 4.965211868286133, "train/global_grad_norm": 0.5850781798362732} -{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2484.2231900609913, "train/update_time": 1397.0756743992679, "train/lr": 0.0008629601180209381, "train/loss": 4.961833477020264, "train/global_grad_norm": 0.6340895295143127} -{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2540.0900395850185, "train/update_time": 1452.8315300212707, "train/lr": 0.0008518664814351503, "train/loss": 4.912302017211914, "train/global_grad_norm": 0.5044277310371399} -{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 2686.6876707019983, "train/update_time": 1508.5666796893347, "train/lr": 0.0008404185451290017, "train/loss": 4.897612571716309, "train/global_grad_norm": 0.4688912034034729} -{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 2742.5622889249935, "train/update_time": 1564.3145829213317, "train/lr": 0.0008286278362039527, "train/loss": 4.848834037780762, "train/global_grad_norm": 0.6365319490432739} -{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2889.1227126879967, "train/update_time": 1620.060068657389, "train/lr": 0.0008165062269044352, "train/loss": 4.8169732093811035, "train/global_grad_norm": 0.4134746789932251} -{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 2944.979424642981, "train/update_time": 1675.8085315313656, "train/lr": 0.0008040659226635089, "train/loss": 4.79654598236084, "train/global_grad_norm": 0.5643511414527893} -{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3091.6305087410146, "train/update_time": 1731.5482600294054, "train/lr": 0.0007913194498130252, "train/loss": 4.810868740081787, "train/global_grad_norm": 0.47013285756111145} -{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3147.4849796229973, "train/update_time": 1787.300771905575, "train/lr": 0.000778279642970672, "train/loss": 4.74250602722168, "train/global_grad_norm": 0.5142323970794678} -{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3293.557014766964, "train/update_time": 1843.0341175765498, "train/lr": 0.0007649596321166025, "train/loss": 4.759753704071045, "train/global_grad_norm": 0.5028547644615173} -{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3349.424384585989, "train/update_time": 1898.7890577405924, "train/lr": 0.0007513728293726579, "train/loss": 4.724730491638184, "train/global_grad_norm": 0.5188063383102417} -{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3496.0525263110176, "train/update_time": 1954.5468697096221, "train/lr": 0.0007375329154974975, "train/loss": 4.704092502593994, "train/global_grad_norm": 0.4179239571094513} -{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 3551.9233703140053, "train/update_time": 2010.2989615525585, "train/lr": 0.0007234538261112341, "train/loss": 4.630825042724609, "train/global_grad_norm": 0.4399227201938629} -{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 3698.38593041501, "train/update_time": 2066.043420936505, "train/lr": 0.0007091497376634464, "train/loss": 4.655548095703125, "train/global_grad_norm": 0.45650508999824524} -{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 3754.2459046120057, "train/update_time": 2121.7969342375873, "train/lr": 0.0006946350531586958, "train/loss": 4.63443660736084, "train/global_grad_norm": 0.4673406481742859} -{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 3902.067099667969, "train/update_time": 2177.5441309445887, "train/lr": 0.0006799243876539214, "train/loss": 4.639521598815918, "train/global_grad_norm": 0.5377744436264038} -{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 3957.9235017200117, "train/update_time": 2233.291398033558, "train/lr": 0.0006650325535423166, "train/loss": 4.547835826873779, "train/global_grad_norm": 0.5047109127044678} -{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4104.85487911501, "train/update_time": 2289.03639949864, "train/lr": 0.0006499745456385053, "train/loss": 4.572357654571533, "train/global_grad_norm": 0.6879011392593384} -{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4160.708882857987, "train/update_time": 2344.7830602055765, "train/lr": 0.0006347655260800339, "train/loss": 4.565418720245361, "train/global_grad_norm": 0.428315132856369} -{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4307.030373459973, "train/update_time": 2400.5389171724673, "train/lr": 0.0006194208090603844, "train/loss": 4.560233116149902, "train/global_grad_norm": 0.45447441935539246} -{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4362.870210377965, "train/update_time": 2456.286583611334, "train/lr": 0.0006039558454088796, "train/loss": 4.5870771408081055, "train/global_grad_norm": 0.7089611887931824} -{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 4509.209549701016, "train/update_time": 2512.043382478296, "train/lr": 0.0005883862070330078, "train/loss": 4.5283427238464355, "train/global_grad_norm": 0.4208521842956543} -{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 4565.060955744993, "train/update_time": 2567.792047406314, "train/lr": 0.0005727275712388317, "train/loss": 4.496908187866211, "train/global_grad_norm": 0.6397818922996521} -{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 4711.463092761987, "train/update_time": 2623.54389296734, "train/lr": 0.0005569957049452703, "train/loss": 4.518903732299805, "train/global_grad_norm": 0.5339348316192627} -{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 4767.3171031199745, "train/update_time": 2679.296893617313, "train/lr": 0.0005412064488081482, "train/loss": 4.495401382446289, "train/global_grad_norm": 0.47157326340675354} -{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 4913.884267903981, "train/update_time": 2735.0481502541807, "train/lr": 0.0005253757012699972, "train/loss": 4.490736484527588, "train/global_grad_norm": 0.5239655375480652} -{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 4969.727964510967, "train/update_time": 2790.797476610227, "train/lr": 0.0005095194025516734, "train/loss": 4.4643659591674805, "train/global_grad_norm": 0.5247243642807007} -{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5116.911609567003, "train/update_time": 2846.5413487541373, "train/lr": 0.0004936535186019053, "train/loss": 4.463287353515625, "train/global_grad_norm": 0.4336317479610443} -{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5172.765950003988, "train/update_time": 2902.285505968146, "train/lr": 0.00047779402502093696, "train/loss": 4.457107067108154, "train/global_grad_norm": 0.6947441101074219} -{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 5319.3806196419755, "train/update_time": 2958.0443075241055, "train/lr": 0.0004619568909744525, "train/loss": 4.4143757820129395, "train/global_grad_norm": 0.45258453488349915} -{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 5375.2188087760005, "train/update_time": 3013.786068893096, "train/lr": 0.00044615806311398067, "train/loss": 4.424180030822754, "train/global_grad_norm": 0.4154273271560669} -{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 5521.543946499005, "train/update_time": 3069.52293490601, "train/lr": 0.0004304134495199673, "train/loss": 4.3700270652771, "train/global_grad_norm": 0.3898273706436157} -{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 5577.396705480001, "train/update_time": 3125.264993761957, "train/lr": 0.0004147389036836882, "train/loss": 4.413632869720459, "train/global_grad_norm": 0.5425747036933899} -{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 5723.710059185978, "train/update_time": 3181.0052098479937, "train/lr": 0.0003991502085441259, "train/loss": 4.3622026443481445, "train/global_grad_norm": 0.45439326763153076} -{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 5779.562335913011, "train/update_time": 3236.749040101946, "train/lr": 0.0003836630605958888, "train/loss": 4.410221576690674, "train/global_grad_norm": 0.4280547499656677} -{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 5925.651880743972, "train/update_time": 3292.4961628898745, "train/lr": 0.00036829305408417155, "train/loss": 4.391324520111084, "train/global_grad_norm": 0.42996275424957275} -{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 5981.512405745976, "train/update_time": 3348.2465934828506, "train/lr": 0.000353055665302672, "train/loss": 4.390552997589111, "train/global_grad_norm": 0.6177342534065247} -{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6128.563732119976, "train/update_time": 3403.9983463209355, "train/lr": 0.0003379662370102746, "train/loss": 4.355296611785889, "train/global_grad_norm": 0.445901095867157} -{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 6184.421642497007, "train/update_time": 3459.7488345169113, "train/lr": 0.00032303996298219405, "train/loss": 4.329927444458008, "train/global_grad_norm": 0.4848615527153015} -{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 6330.95437408, "train/update_time": 3515.502175346832, "train/lr": 0.00030829187271113034, "train/loss": 4.3402838706970215, "train/global_grad_norm": 0.42915236949920654} -{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 6386.8215373010025, "train/update_time": 3571.2574781817966, "train/lr": 0.0002937368162738445, "train/loss": 4.330328464508057, "train/global_grad_norm": 0.44172123074531555} -{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 6532.972345999966, "train/update_time": 3627.0106727198, "train/lr": 0.0002793894493783894, "train/loss": 4.3035969734191895, "train/global_grad_norm": 0.4424532651901245} -{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 6588.8636812510085, "train/update_time": 3682.7761907348176, "train/lr": 0.00026526421860705474, "train/loss": 4.325634956359863, "train/global_grad_norm": 0.4446793496608734} -{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 6734.952371816966, "train/update_time": 3738.5206979417126, "train/lr": 0.0002513753468698824, "train/loss": 4.269580841064453, "train/global_grad_norm": 0.4529637098312378} -{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 6790.794109267998, "train/update_time": 3794.261904676736, "train/lr": 0.00023773681908340283, "train/loss": 4.283663749694824, "train/global_grad_norm": 0.445527583360672} -{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 6936.7826500849915, "train/update_time": 3850.002778201713, "train/lr": 0.00022436236808900823, "train/loss": 4.284794807434082, "train/global_grad_norm": 0.37836042046546936} -{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 6992.639220207988, "train/update_time": 3905.7538149688044, "train/lr": 0.00021126546082514682, "train/loss": 4.279749870300293, "train/global_grad_norm": 0.3362836241722107} -{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 7139.175099594984, "train/update_time": 3961.498215056723, "train/lr": 0.00019845928476725522, "train/loss": 4.276471138000488, "train/global_grad_norm": 0.3601376414299011} -{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 7195.047693797969, "train/update_time": 4017.255656591733, "train/lr": 0.0001859567346490913, "train/loss": 4.2520365715026855, "train/global_grad_norm": 0.3764491081237793} -{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 7341.3063276839675, "train/update_time": 4072.9893345796154, "train/lr": 0.00017377039947882782, "train/loss": 4.269729137420654, "train/global_grad_norm": 0.3962520360946655} -{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 7397.135954166006, "train/update_time": 4128.71436746855, "train/lr": 0.00016191254986299043, "train/loss": 4.254550933837891, "train/global_grad_norm": 0.357697457075119} -{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 7543.42272252898, "train/update_time": 4184.450604122656, "train/lr": 0.00015039512565099468, "train/loss": 4.237186431884766, "train/global_grad_norm": 0.34904253482818604} -{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 7599.255373338994, "train/update_time": 4240.1828456086805, "train/lr": 0.00013922972391273224, "train/loss": 4.198566436767578, "train/global_grad_norm": 0.3618724048137665} -{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 7745.557092848001, "train/update_time": 4295.924490743666, "train/lr": 0.00012842758726130281, "train/loss": 4.263113975524902, "train/global_grad_norm": 0.3145442306995392} -{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 7801.41744280397, "train/update_time": 4351.678691691719, "train/lr": 0.00011799959253265679, "train/loss": 4.1848530769348145, "train/global_grad_norm": 0.3598962128162384} -{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 7947.527445982967, "train/update_time": 4407.426374787698, "train/lr": 0.00010795623983354214, "train/loss": 4.2140374183654785, "train/global_grad_norm": 0.3123509883880615} -{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 8003.3749305050005, "train/update_time": 4463.172267011658, "train/lr": 9.830764196878872e-05, "train/loss": 4.1917405128479, "train/global_grad_norm": 0.31881648302078247} -{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 8150.418857850018, "train/update_time": 4518.926882733707, "train/lr": 8.906351425856951e-05, "train/loss": 4.167685508728027, "train/global_grad_norm": 0.29552316665649414} -{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 8206.275687849964, "train/update_time": 4574.681749307667, "train/lr": 8.02331647558977e-05, "train/loss": 4.179322242736816, "train/global_grad_norm": 0.281093567609787} -{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 8352.533770180016, "train/update_time": 4630.437151424645, "train/lr": 7.182548487420554e-05, "train/loss": 4.211834907531738, "train/global_grad_norm": 0.29659828543663025} -{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 8408.397738418018, "train/update_time": 4686.193155970657, "train/lr": 6.384894043444556e-05, "train/loss": 4.1608757972717285, "train/global_grad_norm": 0.29815351963043213} -{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 8554.563296968001, "train/update_time": 4741.9453661507, "train/lr": 5.6311563140726166e-05, "train/loss": 4.230018138885498, "train/global_grad_norm": 0.2653578221797943} -{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 8610.412180389976, "train/update_time": 4797.692331016588, "train/lr": 4.922094249306547e-05, "train/loss": 4.209297180175781, "train/global_grad_norm": 0.2605638802051544} -{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 8756.645623472985, "train/update_time": 4853.441611350689, "train/lr": 4.2584218145409916e-05, "train/loss": 4.1548752784729, "train/global_grad_norm": 0.2570478022098541} -{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 8812.488870778994, "train/update_time": 4909.191867132671, "train/lr": 3.6408072716606236e-05, "train/loss": 4.172904968261719, "train/global_grad_norm": 0.2740459740161896} -{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 8958.696731061966, "train/update_time": 4964.9637988246395, "train/lr": 3.069872506157217e-05, "train/loss": 4.228043079376221, "train/global_grad_norm": 0.25757673382759094} -{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 9014.539005896018, "train/update_time": 5020.712582220614, "train/lr": 2.5461924009435368e-05, "train/loss": 4.143199920654297, "train/global_grad_norm": 0.2552241086959839} -{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 9161.434383540007, "train/update_time": 5076.4567962596775, "train/lr": 2.0702942574950812e-05, "train/loss": 4.177771091461182, "train/global_grad_norm": 0.24890665709972382} -{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 9217.298647498013, "train/update_time": 5132.212492840539, "train/lr": 1.642657264902142e-05, "train/loss": 4.206305027008057, "train/global_grad_norm": 0.23305842280387878} -{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 9363.553384587984, "train/update_time": 5187.936918072519, "train/lr": 1.2637120173670358e-05, "train/loss": 4.190739154815674, "train/global_grad_norm": 0.22044338285923004} -{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 9419.395047847007, "train/update_time": 5243.674068749533, "train/lr": 9.338400806321978e-06, "train/loss": 4.147926330566406, "train/global_grad_norm": 0.22512836754322052} -{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 9565.561509469, "train/update_time": 5299.418035702605, "train/lr": 6.533736077758867e-06, "train/loss": 4.170260429382324, "train/global_grad_norm": 0.22401364147663116} -{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 9621.420487532974, "train/update_time": 5355.161261588568, "train/lr": 4.2259500476214406e-06, "train/loss": 4.168946266174316, "train/global_grad_norm": 0.215094193816185} -{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 9767.543393198983, "train/update_time": 5410.899587089545, "train/lr": 2.417366460819359e-06, "train/loss": 4.192867755889893, "train/global_grad_norm": 0.21194864809513092} -{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 9823.39123856998, "train/update_time": 5466.646158660587, "train/lr": 1.1098064077174619e-06, "train/loss": 4.168134689331055, "train/global_grad_norm": 0.20849043130874634} +{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 69.9843489350751, "train/update_time": 69.80232589994557, "train/lr": 0.0009000000000000001, "train/loss": 9.773597717285156, "train/global_grad_norm": 1.234387755393982} +{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 137.6325364280492, "train/update_time": 137.34443728171755, "train/lr": 0.0009997960964140947, "train/loss": 8.196192741394043, "train/global_grad_norm": 0.9726490378379822} +{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 306.0479498610366, "train/update_time": 204.88854807184543, "train/lr": 0.0009990914580222257, "train/loss": 7.71175479888916, "train/global_grad_norm": 0.40494367480278015} +{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 373.6941461900715, "train/update_time": 272.43619964295067, "train/lr": 0.0009978842768382998, "train/loss": 7.524440765380859, "train/global_grad_norm": 0.5722494721412659} +{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 541.7261677470524, "train/update_time": 339.9664637759561, "train/lr": 0.0009961757683914405, "train/loss": 7.3614912033081055, "train/global_grad_norm": 0.20995093882083893} +{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 609.3696456589969, "train/update_time": 407.4915405898355, "train/lr": 0.00099396765300483, "train/loss": 7.195133209228516, "train/global_grad_norm": 0.15112251043319702} +{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 776.8270515420008, "train/update_time": 474.9853536799783, "train/lr": 0.0009912621540634887, "train/loss": 7.097200393676758, "train/global_grad_norm": 0.3083263337612152} +{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 844.4406513640424, "train/update_time": 542.4893449847586, "train/lr": 0.000988061995775515, "train/loss": 6.98495626449585, "train/global_grad_norm": 0.2213425636291504} +{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 1012.7395372070605, "train/update_time": 610.0022338308627, "train/lr": 0.0009843704004290394, "train/loss": 6.863668441772461, "train/global_grad_norm": 0.3979201912879944} +{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 1080.3595280270092, "train/update_time": 677.5131666237721, "train/lr": 0.0009801910851476522, "train/loss": 6.781008720397949, "train/global_grad_norm": 0.3312043845653534} +{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1248.9534793440253, "train/update_time": 745.0211975647835, "train/lr": 0.0009755282581475768, "train/loss": 6.697778701782227, "train/global_grad_norm": 0.3066418468952179} +{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1316.5735572939739, "train/update_time": 812.5169301189017, "train/lr": 0.0009703866145003512, "train/loss": 6.592831134796143, "train/global_grad_norm": 0.3352743089199066} +{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1485.0034688690212, "train/update_time": 880.0041609148029, "train/lr": 0.0009647713314052896, "train/loss": 6.561812877655029, "train/global_grad_norm": 0.4492341876029968} +{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1552.605858018971, "train/update_time": 947.4958640788682, "train/lr": 0.0009586880629764817, "train/loss": 6.504783630371094, "train/global_grad_norm": 0.2978050410747528} +{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1720.6480030510575, "train/update_time": 1014.9766604538308, "train/lr": 0.0009521429345495787, "train/loss": 6.405797481536865, "train/global_grad_norm": 0.5476596355438232} +{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1788.244720379007, "train/update_time": 1082.4689192509977, "train/lr": 0.0009451425365140996, "train/loss": 6.375470161437988, "train/global_grad_norm": 0.332520067691803} +{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1956.1968261280563, "train/update_time": 1149.9545339199249, "train/lr": 0.000937693917677468, "train/loss": 6.300417423248291, "train/global_grad_norm": 0.7200642824172974} +{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 2023.8116153230658, "train/update_time": 1217.4519352857023, "train/lr": 0.0009298045781674596, "train/loss": 6.269016265869141, "train/global_grad_norm": 0.29608145356178284} +{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 2191.784361746977, "train/update_time": 1284.9417869256577, "train/lr": 0.0009214824618802108, "train/loss": 6.286498546600342, "train/global_grad_norm": 0.6701951026916504} +{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 2259.3751640570117, "train/update_time": 1352.4327959185466, "train/lr": 0.000912735948481387, "train/loss": 6.202700614929199, "train/global_grad_norm": 0.309689998626709} +{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2427.812098467024, "train/update_time": 1419.9271084357752, "train/lr": 0.0009035738449685707, "train/loss": 6.156064510345459, "train/global_grad_norm": 0.29083284735679626} +{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2495.436698611011, "train/update_time": 1487.4421038717264, "train/lr": 0.0008940053768033609, "train/loss": 6.145916938781738, "train/global_grad_norm": 0.4619344472885132} +{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2663.784864959074, "train/update_time": 1554.9646949897287, "train/lr": 0.0008840401786221159, "train/loss": 6.098778247833252, "train/global_grad_norm": 0.38272926211357117} +{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2731.4342436430743, "train/update_time": 1622.4905407206388, "train/lr": 0.0008736882845346905, "train/loss": 6.057912349700928, "train/global_grad_norm": 0.46633926033973694} +{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2900.0770314050606, "train/update_time": 1690.0012620057678, "train/lr": 0.0008629601180209381, "train/loss": 6.051210880279541, "train/global_grad_norm": 0.3780326843261719} +{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2967.7298853070242, "train/update_time": 1757.5320481728995, "train/lr": 0.0008518664814351503, "train/loss": 6.018684387207031, "train/global_grad_norm": 0.4844052195549011} +{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 3136.192894882057, "train/update_time": 1825.0568903158419, "train/lr": 0.0008404185451290017, "train/loss": 6.0103936195373535, "train/global_grad_norm": 0.3090081512928009} +{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 3203.840811592061, "train/update_time": 1892.5856667858316, "train/lr": 0.0008286278362039527, "train/loss": 5.959850311279297, "train/global_grad_norm": 0.3670918345451355} +{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 3371.6769031099975, "train/update_time": 1960.1043883458478, "train/lr": 0.0008165062269044352, "train/loss": 5.930871486663818, "train/global_grad_norm": 0.5652802586555481} +{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3439.312252484029, "train/update_time": 2027.637110557058, "train/lr": 0.0008040659226635089, "train/loss": 5.914786338806152, "train/global_grad_norm": 0.3531936705112457} +{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3607.989511896041, "train/update_time": 2095.1480140229687, "train/lr": 0.0007913194498130252, "train/loss": 5.9558491706848145, "train/global_grad_norm": 0.31624099612236023} +{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3675.6837699849857, "train/update_time": 2162.6913318177685, "train/lr": 0.000778279642970672, "train/loss": 5.875686168670654, "train/global_grad_norm": 0.45401549339294434} +{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3843.81031891203, "train/update_time": 2230.2011163331335, "train/lr": 0.0007649596321166025, "train/loss": 5.9156365394592285, "train/global_grad_norm": 0.3709852695465088} +{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3911.4375108770328, "train/update_time": 2297.7100352901034, "train/lr": 0.0007513728293726579, "train/loss": 5.878309726715088, "train/global_grad_norm": 0.3850482106208801} +{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 4079.354542599991, "train/update_time": 2365.248016706202, "train/lr": 0.0007375329154974975, "train/loss": 5.850311756134033, "train/global_grad_norm": 0.3051223158836365} +{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 4147.001440979075, "train/update_time": 2432.7737647151807, "train/lr": 0.0007234538261112341, "train/loss": 5.786566734313965, "train/global_grad_norm": 0.3867356479167938} +{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 4315.090891462984, "train/update_time": 2500.2843339033425, "train/lr": 0.0007091497376634464, "train/loss": 5.8186936378479, "train/global_grad_norm": 0.3156871199607849} +{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 4382.723289367976, "train/update_time": 2567.801604798413, "train/lr": 0.0006946350531586958, "train/loss": 5.791891098022461, "train/global_grad_norm": 0.36183497309684753} +{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 4550.521985811065, "train/update_time": 2635.3166619893163, "train/lr": 0.0006799243876539214, "train/loss": 5.8107523918151855, "train/global_grad_norm": 0.49777495861053467} +{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4618.154446462984, "train/update_time": 2702.834149704431, "train/lr": 0.0006650325535423166, "train/loss": 5.713278293609619, "train/global_grad_norm": 0.3702220022678375} +{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4786.292578452965, "train/update_time": 2770.3491651542718, "train/lr": 0.0006499745456385053, "train/loss": 5.73615026473999, "train/global_grad_norm": 0.4145315885543823} +{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4853.921031603008, "train/update_time": 2837.8658607284306, "train/lr": 0.0006347655260800339, "train/loss": 5.749305248260498, "train/global_grad_norm": 0.38909024000167847} +{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 5021.374068334, "train/update_time": 2905.373775637592, "train/lr": 0.0006194208090603844, "train/loss": 5.747371673583984, "train/global_grad_norm": 0.4385876953601837} +{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 5089.012363151065, "train/update_time": 2972.8906648436096, "train/lr": 0.0006039558454088796, "train/loss": 5.779524326324463, "train/global_grad_norm": 0.4157894253730774} +{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 5256.981477431022, "train/update_time": 3040.4158616837813, "train/lr": 0.0005883862070330078, "train/loss": 5.7228264808654785, "train/global_grad_norm": 0.3932007849216461} +{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 5324.629123619059, "train/update_time": 3107.9427982217167, "train/lr": 0.0005727275712388317, "train/loss": 5.6978583335876465, "train/global_grad_norm": 0.45659518241882324} +{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 5492.093443386024, "train/update_time": 3175.472754184855, "train/lr": 0.0005569957049452703, "train/loss": 5.713237285614014, "train/global_grad_norm": 0.351923406124115} +{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 5559.744059987017, "train/update_time": 3242.99941329984, "train/lr": 0.0005412064488081482, "train/loss": 5.698825836181641, "train/global_grad_norm": 0.591474175453186} +{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5727.743233548012, "train/update_time": 3310.5120258319657, "train/lr": 0.0005253757012699972, "train/loss": 5.690817356109619, "train/global_grad_norm": 0.28637227416038513} +{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5795.387424343033, "train/update_time": 3378.042265718919, "train/lr": 0.0005095194025516734, "train/loss": 5.679786682128906, "train/global_grad_norm": 0.34895920753479004} +{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5963.751827322063, "train/update_time": 3445.5543855928117, "train/lr": 0.0004936535186019053, "train/loss": 5.677963733673096, "train/global_grad_norm": 0.4498981237411499} +{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 6031.410010787076, "train/update_time": 3513.0946629439713, "train/lr": 0.00047779402502093696, "train/loss": 5.6720685958862305, "train/global_grad_norm": 0.3658086061477661} +{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 6198.96821509907, "train/update_time": 3580.623477048124, "train/lr": 0.0004619568909744525, "train/loss": 5.642226696014404, "train/global_grad_norm": 0.36008358001708984} +{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 6266.654250354972, "train/update_time": 3648.1618733102223, "train/lr": 0.00044615806311398067, "train/loss": 5.654401779174805, "train/global_grad_norm": 0.49052467942237854} +{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 6434.099346054019, "train/update_time": 3715.6762271750486, "train/lr": 0.0004304134495199673, "train/loss": 5.601498126983643, "train/global_grad_norm": 0.36875054240226746} +{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 6501.729716927046, "train/update_time": 3783.1945504179457, "train/lr": 0.0004147389036836882, "train/loss": 5.638054847717285, "train/global_grad_norm": 0.3628014028072357} +{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 6669.622392687015, "train/update_time": 3850.7147646707017, "train/lr": 0.0003991502085441259, "train/loss": 5.593812465667725, "train/global_grad_norm": 0.40251973271369934} +{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 6737.260854291031, "train/update_time": 3918.229618334677, "train/lr": 0.0003836630605958888, "train/loss": 5.651180267333984, "train/global_grad_norm": 0.41545599699020386} +{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6905.097716190037, "train/update_time": 3985.7243089615367, "train/lr": 0.00036829305408417155, "train/loss": 5.651818752288818, "train/global_grad_norm": 0.2855614423751831} +{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6972.698619629024, "train/update_time": 4053.2279500714503, "train/lr": 0.000353055665302672, "train/loss": 5.643282890319824, "train/global_grad_norm": 0.3338489830493927} +{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 7141.385740551981, "train/update_time": 4120.727246251656, "train/lr": 0.0003379662370102746, "train/loss": 5.617847442626953, "train/global_grad_norm": 0.33251237869262695} +{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 7208.982709518052, "train/update_time": 4188.228603066411, "train/lr": 0.00032303996298219405, "train/loss": 5.588885307312012, "train/global_grad_norm": 0.3960968255996704} +{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 7376.684962157044, "train/update_time": 4255.729584518704, "train/lr": 0.00030829187271113034, "train/loss": 5.613091945648193, "train/global_grad_norm": 0.36357083916664124} +{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 7444.285434685065, "train/update_time": 4323.2266572538065, "train/lr": 0.0002937368162738445, "train/loss": 5.598479270935059, "train/global_grad_norm": 0.35109350085258484} +{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 7612.232262870064, "train/update_time": 4390.719008370885, "train/lr": 0.0002793894493783894, "train/loss": 5.568142890930176, "train/global_grad_norm": 0.38976845145225525} +{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 7679.831617571064, "train/update_time": 4458.218750593718, "train/lr": 0.00026526421860705474, "train/loss": 5.5909423828125, "train/global_grad_norm": 0.3374827802181244} +{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 7847.450622586999, "train/update_time": 4525.717398178764, "train/lr": 0.0002513753468698824, "train/loss": 5.548323631286621, "train/global_grad_norm": 0.35127025842666626} +{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 7915.0508393970085, "train/update_time": 4593.218119100784, "train/lr": 0.00023773681908340283, "train/loss": 5.565690517425537, "train/global_grad_norm": 0.32669878005981445} +{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 8083.225679086056, "train/update_time": 4660.71875171666, "train/lr": 0.00022436236808900823, "train/loss": 5.576150894165039, "train/global_grad_norm": 0.33521243929862976} +{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 8150.834610218997, "train/update_time": 4728.229743778473, "train/lr": 0.00021126546082514682, "train/loss": 5.5722150802612305, "train/global_grad_norm": 0.3350072205066681} +{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 8318.88102039101, "train/update_time": 4795.734360322589, "train/lr": 0.00019845928476725522, "train/loss": 5.558324337005615, "train/global_grad_norm": 0.29043614864349365} +{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 8386.480246171006, "train/update_time": 4863.235517622321, "train/lr": 0.0001859567346490913, "train/loss": 5.551104545593262, "train/global_grad_norm": 0.31684166193008423} +{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 8554.17625815398, "train/update_time": 4930.740559646045, "train/lr": 0.00017377039947882782, "train/loss": 5.5726318359375, "train/global_grad_norm": 0.32063278555870056} +{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 8621.791595970979, "train/update_time": 4998.250562901143, "train/lr": 0.00016191254986299043, "train/loss": 5.5637898445129395, "train/global_grad_norm": 0.34598803520202637} +{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 8789.583836077014, "train/update_time": 5065.748505146126, "train/lr": 0.00015039512565099468, "train/loss": 5.560715675354004, "train/global_grad_norm": 0.327943354845047} +{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 8857.199786913, "train/update_time": 5133.253916564281, "train/lr": 0.00013922972391273224, "train/loss": 5.501072883605957, "train/global_grad_norm": 0.29274871945381165} +{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 9025.246275466983, "train/update_time": 5200.750415898627, "train/lr": 0.00012842758726130281, "train/loss": 5.579266548156738, "train/global_grad_norm": 0.2941227853298187} +{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 9092.876169198076, "train/update_time": 5268.2632564986125, "train/lr": 0.00011799959253265679, "train/loss": 5.490416526794434, "train/global_grad_norm": 0.2656475007534027} +{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 9260.972570382059, "train/update_time": 5335.757341830526, "train/lr": 0.00010795623983354214, "train/loss": 5.527663230895996, "train/global_grad_norm": 0.26042425632476807} +{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 9328.580250757048, "train/update_time": 5403.266901573632, "train/lr": 9.830764196878872e-05, "train/loss": 5.501196384429932, "train/global_grad_norm": 0.258548378944397} +{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 9497.46972536901, "train/update_time": 5470.770781763713, "train/lr": 8.906351425856951e-05, "train/loss": 5.472145080566406, "train/global_grad_norm": 0.2814105153083801} +{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 9565.069234885043, "train/update_time": 5538.269277713727, "train/lr": 8.02331647558977e-05, "train/loss": 5.506821155548096, "train/global_grad_norm": 0.26193347573280334} +{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 9732.95014503505, "train/update_time": 5605.777275754837, "train/lr": 7.182548487420554e-05, "train/loss": 5.532543182373047, "train/global_grad_norm": 0.22665195167064667} +{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 9800.571015445981, "train/update_time": 5673.296408364549, "train/lr": 6.384894043444556e-05, "train/loss": 5.483277320861816, "train/global_grad_norm": 0.2179127037525177} +{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 9968.994457261055, "train/update_time": 5740.817306284327, "train/lr": 5.6311563140726166e-05, "train/loss": 5.565979957580566, "train/global_grad_norm": 0.2223806530237198} +{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 10036.600267508999, "train/update_time": 5808.320603007567, "train/lr": 4.922094249306547e-05, "train/loss": 5.534444808959961, "train/global_grad_norm": 0.2236238718032837} +{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 10204.47946456098, "train/update_time": 5875.824670242728, "train/lr": 4.2584218145409916e-05, "train/loss": 5.472100734710693, "train/global_grad_norm": 0.21749918162822723} +{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 10272.080307441996, "train/update_time": 5943.3299956357805, "train/lr": 3.6408072716606236e-05, "train/loss": 5.504454135894775, "train/global_grad_norm": 0.20975066721439362} +{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 10440.040362586034, "train/update_time": 6010.818879780243, "train/lr": 3.069872506157217e-05, "train/loss": 5.567077159881592, "train/global_grad_norm": 0.20836369693279266} +{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 10507.649754095008, "train/update_time": 6078.3277447193395, "train/lr": 2.5461924009435368e-05, "train/loss": 5.464453220367432, "train/global_grad_norm": 0.2269560843706131} +{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 10707.957879410009, "train/update_time": 6177.141915226472, "train/lr": 2.0702942574950812e-05, "train/loss": 5.509556770324707, "train/global_grad_norm": 0.2095288634300232} +{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 10899.09726011497, "train/update_time": 6367.991133535514, "train/lr": 1.642657264902142e-05, "train/loss": 5.54543924331665, "train/global_grad_norm": 0.20035825669765472} +{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 11285.98193423904, "train/update_time": 6497.941741914721, "train/lr": 1.2637120173670358e-05, "train/loss": 5.523913383483887, "train/global_grad_norm": 0.18863900005817413} +{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 11471.464317596983, "train/update_time": 6683.226909449557, "train/lr": 9.338400806321978e-06, "train/loss": 5.4696879386901855, "train/global_grad_norm": 0.19305512309074402} +{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 11871.571419561049, "train/update_time": 6813.441040770616, "train/lr": 6.533736077758867e-06, "train/loss": 5.509196758270264, "train/global_grad_norm": 0.18883228302001953} +{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 12034.565373313962, "train/update_time": 6976.259414628497, "train/lr": 4.2259500476214406e-06, "train/loss": 5.492060661315918, "train/global_grad_norm": 0.18720072507858276} +{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 12449.846788060968, "train/update_time": 7106.677357654553, "train/lr": 2.417366460819359e-06, "train/loss": 5.52823543548584, "train/global_grad_norm": 0.1909027099609375} +{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 12586.090243482962, "train/update_time": 7242.765970333596, "train/lr": 1.1098064077174619e-06, "train/loss": 5.510114669799805, "train/global_grad_norm": 0.17430830001831055} diff --git a/metrics/jsonlines/train_eval.jsonl b/metrics/jsonlines/train_eval.jsonl index 2fb4c047a55861fd05aee285ee3902ef1452cf82..a9b390795add7229561ed900ef19ebc7a63073d9 100644 --- a/metrics/jsonlines/train_eval.jsonl +++ b/metrics/jsonlines/train_eval.jsonl @@ -1,19 +1,19 @@ -{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 464.0044742010068, "train_eval/train_update_time": 282.4727703850367, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.262765104495848, "train_eval/perplexity_len_2048": 3876.7990479882474, "train_eval/loss_avg_len_1024": 8.26361274068222, "train_eval/perplexity_len_1024": 3880.086556257262, "train_eval/loss_avg_len_512": 8.264419558200608, "train_eval/perplexity_len_512": 3883.218341283336} -{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 923.5610087590176, "train_eval/train_update_time": 561.0979154942906, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.399099997472659, "train_eval/perplexity_len_2048": 601.3036194924304, "train_eval/loss_avg_len_1024": 6.403366397288846, "train_eval/perplexity_len_1024": 603.8745014496265, "train_eval/loss_avg_len_512": 6.409683007578133, "train_eval/perplexity_len_512": 607.7010139099035} -{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1473.913084711996, "train_eval/train_update_time": 839.7419353383593, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.693106125889135, "train_eval/perplexity_len_2048": 296.8141323485163, "train_eval/loss_avg_len_1024": 5.698990526291018, "train_eval/perplexity_len_1024": 298.5658544105799, "train_eval/loss_avg_len_512": 5.710699294427177, "train_eval/perplexity_len_512": 302.08223886516663} -{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1933.6392927800189, "train_eval/train_update_time": 1118.3960400532233, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.296922603823786, "train_eval/perplexity_len_2048": 199.7212419010431, "train_eval/loss_avg_len_1024": 5.305337436088958, "train_eval/perplexity_len_1024": 201.40895359804367, "train_eval/loss_avg_len_512": 5.320490509328956, "train_eval/perplexity_len_512": 204.48415878511435} -{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2484.2231900609913, "train_eval/train_update_time": 1397.0756743992679, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.045304426316234, "train_eval/perplexity_len_2048": 155.29156684287375, "train_eval/loss_avg_len_1024": 5.053415232166262, "train_eval/perplexity_len_1024": 156.55622837075202, "train_eval/loss_avg_len_512": 5.070610678311423, "train_eval/perplexity_len_512": 159.27156133906544} -{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2944.979424642981, "train_eval/train_update_time": 1675.8085315313656, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.881278812076035, "train_eval/perplexity_len_2048": 131.79910244606154, "train_eval/loss_avg_len_1024": 4.889370008184379, "train_eval/perplexity_len_1024": 132.86984076618447, "train_eval/loss_avg_len_512": 4.908291251527554, "train_eval/perplexity_len_512": 135.40783867497828} -{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3496.0525263110176, "train_eval/train_update_time": 1954.5468697096221, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.752543167285239, "train_eval/perplexity_len_2048": 115.87860879757943, "train_eval/loss_avg_len_1024": 4.763826194274043, "train_eval/perplexity_len_1024": 117.19347414945925, "train_eval/loss_avg_len_512": 4.785651780011176, "train_eval/perplexity_len_512": 119.77940747075029} -{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3957.9235017200117, "train_eval/train_update_time": 2233.291398033558, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.653148743151705, "train_eval/perplexity_len_2048": 104.91481583709675, "train_eval/loss_avg_len_1024": 4.6641259991965125, "train_eval/perplexity_len_1024": 106.07283695212364, "train_eval/loss_avg_len_512": 4.687856853806879, "train_eval/perplexity_len_512": 108.62014133645553} -{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4509.209549701016, "train_eval/train_update_time": 2512.043382478296, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.56949279251452, "train_eval/perplexity_len_2048": 96.49515429403105, "train_eval/loss_avg_len_1024": 4.584133220926888, "train_eval/perplexity_len_1024": 97.91827683495046, "train_eval/loss_avg_len_512": 4.612269650588205, "train_eval/perplexity_len_512": 100.7124725543258} -{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4969.727964510967, "train_eval/train_update_time": 2790.797476610227, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.500111393837432, "train_eval/perplexity_len_2048": 90.02715921272548, "train_eval/loss_avg_len_1024": 4.5146006559921075, "train_eval/perplexity_len_1024": 91.34108222421936, "train_eval/loss_avg_len_512": 4.545015140839531, "train_eval/perplexity_len_512": 94.16185288811836} -{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5521.543946499005, "train_eval/train_update_time": 3069.52293490601, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.434400985772954, "train_eval/perplexity_len_2048": 84.30161189591196, "train_eval/loss_avg_len_1024": 4.448710203694063, "train_eval/perplexity_len_1024": 85.51657387892722, "train_eval/loss_avg_len_512": 4.479653784418624, "train_eval/perplexity_len_512": 88.20412974467796} -{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5981.512405745976, "train_eval/train_update_time": 3348.2465934828506, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.3785154093765595, "train_eval/perplexity_len_2048": 79.71959454765936, "train_eval/loss_avg_len_1024": 4.3939165947328, "train_eval/perplexity_len_1024": 80.95687412946403, "train_eval/loss_avg_len_512": 4.429078622167507, "train_eval/perplexity_len_512": 83.85411997858606} -{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6532.972345999966, "train_eval/train_update_time": 3627.0106727198, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.332845308472933, "train_eval/perplexity_len_2048": 76.16067919713348, "train_eval/loss_avg_len_1024": 4.354710251906472, "train_eval/perplexity_len_1024": 77.84426684093609, "train_eval/loss_avg_len_512": 4.39427122400477, "train_eval/perplexity_len_512": 80.98558889804535} -{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6992.639220207988, "train_eval/train_update_time": 3905.7538149688044, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.286469663051848, "train_eval/perplexity_len_2048": 72.70932644602816, "train_eval/loss_avg_len_1024": 4.309569447513205, "train_eval/perplexity_len_1024": 74.40844530144317, "train_eval/loss_avg_len_512": 4.354222116721867, "train_eval/perplexity_len_512": 77.80627758807101} -{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7543.42272252898, "train_eval/train_update_time": 4184.450604122656, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.249375205130373, "train_eval/perplexity_len_2048": 70.06162452534807, "train_eval/loss_avg_len_1024": 4.273874875287056, "train_eval/perplexity_len_1024": 71.7993106682297, "train_eval/loss_avg_len_512": 4.321713214736082, "train_eval/perplexity_len_512": 75.31755296428902} -{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8003.3749305050005, "train_eval/train_update_time": 4463.172267011658, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.2222635463871026, "train_eval/perplexity_len_2048": 68.18765565818079, "train_eval/loss_avg_len_1024": 4.249677161750205, "train_eval/perplexity_len_1024": 70.08278329102363, "train_eval/loss_avg_len_512": 4.299936973010299, "train_eval/perplexity_len_512": 73.69514876983682} -{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8554.563296968001, "train_eval/train_update_time": 4741.9453661507, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.195244031412512, "train_eval/perplexity_len_2048": 66.36992594802435, "train_eval/loss_avg_len_1024": 4.218284104051746, "train_eval/perplexity_len_1024": 67.9168460075776, "train_eval/loss_avg_len_512": 4.2707039155407625, "train_eval/perplexity_len_512": 71.57199853357183} -{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9014.539005896018, "train_eval/train_update_time": 5020.712582220614, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.188622776587208, "train_eval/perplexity_len_2048": 65.93192541236388, "train_eval/loss_avg_len_1024": 4.215066284977402, "train_eval/perplexity_len_1024": 67.69865312590402, "train_eval/loss_avg_len_512": 4.268878927308142, "train_eval/perplexity_len_512": 71.4414995941971} -{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9565.561509469, "train_eval/train_update_time": 5299.418035702605, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 4.176184563983006, "train_eval/perplexity_len_2048": 65.11692916224894, "train_eval/loss_avg_len_1024": 4.205971465967996, "train_eval/perplexity_len_1024": 67.08573753155041, "train_eval/loss_avg_len_512": 4.259276238732308, "train_eval/perplexity_len_512": 70.75875247262296} +{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 541.7261677470524, "train_eval/train_update_time": 339.9664637759561, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.431279315594583, "train_eval/perplexity_len_2048": 4588.366308362686, "train_eval/loss_avg_len_1024": 8.43578569962643, "train_eval/perplexity_len_1024": 4609.089908206309, "train_eval/loss_avg_len_512": 8.435858583673834, "train_eval/perplexity_len_512": 4609.4258495759} +{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1080.3595280270092, "train_eval/train_update_time": 677.5131666237721, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 7.043303485368378, "train_eval/perplexity_len_2048": 1145.164398741174, "train_eval/loss_avg_len_1024": 7.050072871062438, "train_eval/perplexity_len_1024": 1152.9427558858404, "train_eval/loss_avg_len_512": 7.055086852745153, "train_eval/perplexity_len_512": 1158.7381064947092} +{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1720.6480030510575, "train_eval/train_update_time": 1014.9766604538308, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.58473962082513, "train_eval/perplexity_len_2048": 723.9625183603853, "train_eval/loss_avg_len_1024": 6.593009713915525, "train_eval/perplexity_len_1024": 729.9745816768931, "train_eval/loss_avg_len_512": 6.6036262488353525, "train_eval/perplexity_len_512": 737.7656662966635} +{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2259.3751640570117, "train_eval/train_update_time": 1352.4327959185466, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.304370796141812, "train_eval/perplexity_len_2048": 546.9573322289459, "train_eval/loss_avg_len_1024": 6.315740904568084, "train_eval/perplexity_len_1024": 553.2117859284505, "train_eval/loss_avg_len_512": 6.329661400500045, "train_eval/perplexity_len_512": 560.9666186754808} +{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2900.0770314050606, "train_eval/train_update_time": 1690.0012620057678, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.118539933643442, "train_eval/perplexity_len_2048": 454.2010464652805, "train_eval/loss_avg_len_1024": 6.128848941022952, "train_eval/perplexity_len_1024": 458.90762680798815, "train_eval/loss_avg_len_512": 6.144599953355937, "train_eval/perplexity_len_512": 466.1931127637189} +{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3439.312252484029, "train_eval/train_update_time": 2027.637110557058, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.993265189037047, "train_eval/perplexity_len_2048": 400.7209056232889, "train_eval/loss_avg_len_1024": 6.003427277760493, "train_eval/perplexity_len_1024": 404.8138281180551, "train_eval/loss_avg_len_512": 6.020754306166491, "train_eval/perplexity_len_512": 411.8891691184047} +{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4079.354542599991, "train_eval/train_update_time": 2365.248016706202, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.893832671793534, "train_eval/perplexity_len_2048": 362.7930900146717, "train_eval/loss_avg_len_1024": 5.90759678772476, "train_eval/perplexity_len_1024": 367.82114011647366, "train_eval/loss_avg_len_512": 5.928100219366606, "train_eval/perplexity_len_512": 375.4405811366837} +{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4618.154446462984, "train_eval/train_update_time": 2702.834149704431, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.817333242833847, "train_eval/perplexity_len_2048": 336.074628131044, "train_eval/loss_avg_len_1024": 5.829878006978615, "train_eval/perplexity_len_1024": 340.31716022069065, "train_eval/loss_avg_len_512": 5.851326766426645, "train_eval/perplexity_len_512": 347.6953851520663} +{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5256.981477431022, "train_eval/train_update_time": 3040.4158616837813, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.752478665695435, "train_eval/perplexity_len_2048": 314.9703998548185, "train_eval/loss_avg_len_1024": 5.7688500021151405, "train_eval/perplexity_len_1024": 320.1693268063645, "train_eval/loss_avg_len_512": 5.79529527088307, "train_eval/perplexity_len_512": 328.74923973070185} +{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5795.387424343033, "train_eval/train_update_time": 3378.042265718919, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.702735505279561, "train_eval/perplexity_len_2048": 299.68607355137283, "train_eval/loss_avg_len_1024": 5.718074424200022, "train_eval/perplexity_len_1024": 304.3183703229844, "train_eval/loss_avg_len_512": 5.745852673280315, "train_eval/perplexity_len_512": 312.89030732338205} +{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6434.099346054019, "train_eval/train_update_time": 3715.6762271750486, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.655646991200555, "train_eval/perplexity_len_2048": 285.9013985897084, "train_eval/loss_avg_len_1024": 5.669753373692364, "train_eval/perplexity_len_1024": 289.96301306531365, "train_eval/loss_avg_len_512": 5.69629149336979, "train_eval/perplexity_len_512": 297.7611018557276} +{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6972.698619629024, "train_eval/train_update_time": 4053.2279500714503, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.61831548873808, "train_eval/perplexity_len_2048": 275.42503567083156, "train_eval/loss_avg_len_1024": 5.633299176973669, "train_eval/perplexity_len_1024": 279.58299150262565, "train_eval/loss_avg_len_512": 5.662484543039026, "train_eval/perplexity_len_512": 287.8629627292343} +{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7612.232262870064, "train_eval/train_update_time": 4390.719008370885, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.596533485618165, "train_eval/perplexity_len_2048": 269.49059333919547, "train_eval/loss_avg_len_1024": 5.6167390749404875, "train_eval/perplexity_len_1024": 274.9911938913255, "train_eval/loss_avg_len_512": 5.646642419948766, "train_eval/perplexity_len_512": 283.33853513529573} +{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8150.834610218997, "train_eval/train_update_time": 4728.229743778473, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.5670235770304135, "train_eval/perplexity_len_2048": 261.6541456709636, "train_eval/loss_avg_len_1024": 5.5871347326369865, "train_eval/perplexity_len_1024": 266.9695835538501, "train_eval/loss_avg_len_512": 5.621378581559547, "train_eval/perplexity_len_512": 276.26998153360597} +{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8789.583836077014, "train_eval/train_update_time": 5065.748505146126, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.54864313980328, "train_eval/perplexity_len_2048": 256.8887571941226, "train_eval/loss_avg_len_1024": 5.569592643196739, "train_eval/perplexity_len_1024": 262.3272166956895, "train_eval/loss_avg_len_512": 5.603579787398194, "train_eval/perplexity_len_512": 271.39621128174645} +{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9328.580250757048, "train_eval/train_update_time": 5403.266901573632, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.533965866914477, "train_eval/perplexity_len_2048": 253.1458657141699, "train_eval/loss_avg_len_1024": 5.5568862003808315, "train_eval/perplexity_len_1024": 259.0150583561873, "train_eval/loss_avg_len_512": 5.590751695272338, "train_eval/perplexity_len_512": 267.9369509736206} +{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9968.994457261055, "train_eval/train_update_time": 5740.817306284327, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.516171709689115, "train_eval/perplexity_len_2048": 248.68118876485585, "train_eval/loss_avg_len_1024": 5.532715844693376, "train_eval/perplexity_len_1024": 252.82962545170352, "train_eval/loss_avg_len_512": 5.567636160671682, "train_eval/perplexity_len_512": 261.8144798242977} +{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 10507.649754095008, "train_eval/train_update_time": 6078.3277447193395, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.515744624709568, "train_eval/perplexity_len_2048": 248.5750034411309, "train_eval/loss_avg_len_1024": 5.536319641943482, "train_eval/perplexity_len_1024": 253.74241592866284, "train_eval/loss_avg_len_512": 5.571619798068714, "train_eval/perplexity_len_512": 262.8595339535372} +{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 11871.571419561049, "train_eval/train_update_time": 6813.441040770616, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.506232261902841, "train_eval/perplexity_len_2048": 246.22167840950056, "train_eval/loss_avg_len_1024": 5.529569865919621, "train_eval/perplexity_len_1024": 252.03547865612154, "train_eval/loss_avg_len_512": 5.564332565377117, "train_eval/perplexity_len_512": 260.9509778565561} diff --git a/metrics/jsonlines/val.jsonl b/metrics/jsonlines/val.jsonl index bd3b22e0374ee33fc7ce22ffc58fea25fd7ee936..8364a29be5ccecec4d463575e446d74efbb6b602 100644 --- a/metrics/jsonlines/val.jsonl +++ b/metrics/jsonlines/val.jsonl @@ -1,49 +1,49 @@ -{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 115.56162341398885, "val/train_update_time": 115.25218190002488, "val/loss": 8.017322944736389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.36243360501248, "val/val_tokens_per_second": 453285.711394651, "val/loss_avg_len_2048": 8.017322944736389, "val/perplexity_len_2048": 3033.046820927388, "val/loss_avg_len_1024": 8.016116743054521, "val/perplexity_len_1024": 3029.3905602879668, "val/loss_avg_len_512": 8.016581874255465, "val/perplexity_len_512": 3030.799952108046} -{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 317.6585605319706, "val/train_update_time": 226.74485654599266, "val/loss": 7.168872293418506, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.48027557897149, "val/val_tokens_per_second": 452695.34976438014, "val/loss_avg_len_2048": 7.168872293418506, "val/perplexity_len_2048": 1298.379585700498, "val/loss_avg_len_1024": 7.169298829473462, "val/perplexity_len_1024": 1298.933509532663, "val/loss_avg_len_512": 7.17260874950029, "val/perplexity_len_512": 1303.2399987050917} -{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 519.8498190940008, "val/train_update_time": 338.1969051870401, "val/loss": 6.680456670384901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.21118094300618, "val/val_tokens_per_second": 454045.7133121647, "val/loss_avg_len_2048": 6.680456670384901, "val/perplexity_len_2048": 796.6828504192507, "val/loss_avg_len_1024": 6.681968356456887, "val/perplexity_len_1024": 797.8880955346282, "val/loss_avg_len_512": 6.6880630861138926, "val/perplexity_len_512": 802.7658569931743} -{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 721.7491259319941, "val/train_update_time": 449.6537483881111, "val/loss": 6.256492450360163, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.13445069699083, "val/val_tokens_per_second": 454432.236323236, "val/loss_avg_len_2048": 6.256492450360163, "val/perplexity_len_2048": 521.3869384996046, "val/loss_avg_len_1024": 6.25937858268139, "val/perplexity_len_1024": 522.8939037992483, "val/loss_avg_len_512": 6.268213871597686, "val/perplexity_len_512": 527.5342919101196} -{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 923.5610087590176, "val/train_update_time": 561.0979154942906, "val/loss": 5.9596897887737725, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.17932575498708, "val/val_tokens_per_second": 454206.1016433674, "val/loss_avg_len_2048": 5.9596897887737725, "val/perplexity_len_2048": 387.48990187397294, "val/loss_avg_len_1024": 5.963750460020918, "val/perplexity_len_1024": 389.0665699760066, "val/loss_avg_len_512": 5.9747771193729715, "val/perplexity_len_512": 393.38041444619915} -{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1125.8911167309852, "val/train_update_time": 672.5490376223461, "val/loss": 5.729621500730747, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.24560184002621, "val/val_tokens_per_second": 453872.5341164848, "val/loss_avg_len_2048": 5.729621500730747, "val/perplexity_len_2048": 307.8527242916948, "val/loss_avg_len_1024": 5.73466736189276, "val/perplexity_len_1024": 309.4100320720618, "val/loss_avg_len_512": 5.747293829907757, "val/perplexity_len_512": 313.34155634560165} -{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1327.8194525539875, "val/train_update_time": 784.0098267712165, "val/loss": 5.54191019657671, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.24336050899001, "val/val_tokens_per_second": 453883.80673079635, "val/loss_avg_len_2048": 5.54191019657671, "val/perplexity_len_2048": 255.1649494383086, "val/loss_avg_len_1024": 5.5479404277496975, "val/perplexity_len_1024": 256.70830177953565, "val/loss_avg_len_512": 5.5618576472472405, "val/perplexity_len_512": 260.3059440825885} -{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1529.7566971820197, "val/train_update_time": 895.4798201125232, "val/loss": 5.395747513790498, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.28668313001981, "val/val_tokens_per_second": 453666.01784467406, "val/loss_avg_len_2048": 5.395747513790498, "val/perplexity_len_2048": 220.46688755473716, "val/loss_avg_len_1024": 5.40283216586914, "val/perplexity_len_1024": 222.03436470678773, "val/loss_avg_len_512": 5.417992734318786, "val/perplexity_len_512": 225.42617783355703} -{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1731.7409806579817, "val/train_update_time": 1006.941321704362, "val/loss": 5.257520105597726, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.21052933402825, "val/val_tokens_per_second": 454048.99297658267, "val/loss_avg_len_2048": 5.257520105597726, "val/perplexity_len_2048": 192.0047489041577, "val/loss_avg_len_1024": 5.265500482419599, "val/perplexity_len_1024": 193.54314949562067, "val/loss_avg_len_512": 5.282038998350409, "val/perplexity_len_512": 196.77068168657516} -{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1933.6392927800189, "val/train_update_time": 1118.3960400532233, "val/loss": 5.150704617314763, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.31206437497167, "val/val_tokens_per_second": 453538.51983646286, "val/loss_avg_len_2048": 5.150704617314763, "val/perplexity_len_2048": 172.55303134546992, "val/loss_avg_len_1024": 5.1593652144801805, "val/perplexity_len_1024": 174.0539336132167, "val/loss_avg_len_512": 5.177391785788723, "val/perplexity_len_512": 177.21998000419174} -{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2136.1286090469803, "val/train_update_time": 1229.8719967252691, "val/loss": 5.0635993114376445, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.34318823303329, "val/val_tokens_per_second": 453382.27265509864, "val/loss_avg_len_2048": 5.0635993114376445, "val/perplexity_len_2048": 158.15875569300152, "val/loss_avg_len_1024": 5.0730407805304045, "val/perplexity_len_1024": 159.6590781757551, "val/loss_avg_len_512": 5.092240632939898, "val/perplexity_len_512": 162.75412606608745} -{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2338.156839519972, "val/train_update_time": 1341.322897736216, "val/loss": 4.98549556239089, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.20560409803875, "val/val_tokens_per_second": 454073.78410196304, "val/loss_avg_len_2048": 4.98549556239089, "val/perplexity_len_2048": 146.2760459748089, "val/loss_avg_len_1024": 4.995756369349081, "val/perplexity_len_1024": 147.78468292514813, "val/loss_avg_len_512": 5.016161771441624, "val/perplexity_len_512": 150.8312664737655} -{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2540.0900395850185, "val/train_update_time": 1452.8315300212707, "val/loss": 4.916477123672562, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.74230563105084, "val/val_tokens_per_second": 451388.1338494888, "val/loss_avg_len_2048": 4.916477123672562, "val/perplexity_len_2048": 136.5208190724984, "val/loss_avg_len_1024": 4.927128413101426, "val/perplexity_len_1024": 137.98271353908035, "val/loss_avg_len_512": 4.948208645739966, "val/perplexity_len_512": 140.92229592495846} -{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 2742.5622889249935, "val/train_update_time": 1564.3145829213317, "val/loss": 4.863091215804801, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.69826148403808, "val/val_tokens_per_second": 451607.3332586261, "val/loss_avg_len_2048": 4.863091215804801, "val/perplexity_len_2048": 129.42366084863215, "val/loss_avg_len_1024": 4.874493102245079, "val/perplexity_len_1024": 130.9077795303594, "val/loss_avg_len_512": 4.896728463353682, "val/perplexity_len_512": 133.85116361495074} -{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 2944.979424642981, "val/train_update_time": 1675.8085315313656, "val/loss": 4.811523659892753, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.33663047896698, "val/val_tokens_per_second": 453415.18476867134, "val/loss_avg_len_2048": 4.811523659892753, "val/perplexity_len_2048": 122.91876129597873, "val/loss_avg_len_1024": 4.8232065941833895, "val/perplexity_len_1024": 124.36323452041226, "val/loss_avg_len_512": 4.846166890252475, "val/perplexity_len_512": 127.2516841422241} -{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3147.4849796229973, "val/train_update_time": 1787.300771905575, "val/loss": 4.760587245357363, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.22133365698392, "val/val_tokens_per_second": 453994.61900804366, "val/loss_avg_len_2048": 4.760587245357363, "val/perplexity_len_2048": 116.8145045362375, "val/loss_avg_len_1024": 4.77283736684951, "val/perplexity_len_1024": 118.25429722128945, "val/loss_avg_len_512": 4.796683278769628, "val/perplexity_len_512": 121.10806894510807} -{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3349.424384585989, "val/train_update_time": 1898.7890577405924, "val/loss": 4.719228506370658, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.74322453403147, "val/val_tokens_per_second": 451383.562908752, "val/loss_avg_len_2048": 4.719228506370658, "val/perplexity_len_2048": 112.08174894828639, "val/loss_avg_len_1024": 4.73204894817751, "val/perplexity_len_1024": 113.52793706523403, "val/loss_avg_len_512": 4.756577379063424, "val/perplexity_len_512": 116.3470318696743} -{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 3551.9233703140053, "val/train_update_time": 2010.2989615525585, "val/loss": 4.676367494543736, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.61035839398392, "val/val_tokens_per_second": 452045.44740791514, "val/loss_avg_len_2048": 4.676367494543736, "val/perplexity_len_2048": 107.37930735283909, "val/loss_avg_len_1024": 4.689829182334012, "val/perplexity_len_1024": 108.8345873493113, "val/loss_avg_len_512": 4.7154578478252525, "val/perplexity_len_512": 111.65992272494637} -{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 3754.2459046120057, "val/train_update_time": 2121.7969342375873, "val/loss": 4.640193889026716, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 91.96643112896709, "val/val_tokens_per_second": 445379.90109196096, "val/loss_avg_len_2048": 4.640193889026716, "val/perplexity_len_2048": 103.56442564245071, "val/loss_avg_len_1024": 4.654089609145093, "val/perplexity_len_1024": 105.01357307089665, "val/loss_avg_len_512": 4.680419558078237, "val/perplexity_len_512": 107.81529786259468} -{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 3957.9235017200117, "val/train_update_time": 2233.291398033558, "val/loss": 4.608071265847772, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.61327432200778, "val/val_tokens_per_second": 452030.90062105615, "val/loss_avg_len_2048": 4.608071265847772, "val/perplexity_len_2048": 100.29052920641857, "val/loss_avg_len_1024": 4.622682944629249, "val/perplexity_len_1024": 101.76670061160816, "val/loss_avg_len_512": 4.650229472655617, "val/perplexity_len_512": 104.60898772530528} -{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4160.708882857987, "val/train_update_time": 2344.7830602055765, "val/loss": 4.577349349257373, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.45394109800691, "val/val_tokens_per_second": 452827.1460899621, "val/loss_avg_len_2048": 4.577349349257373, "val/perplexity_len_2048": 97.25625986875657, "val/loss_avg_len_1024": 4.592617217212357, "val/perplexity_len_1024": 98.75254910922418, "val/loss_avg_len_512": 4.621059415361099, "val/perplexity_len_512": 101.60161344282939} -{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4362.870210377965, "val/train_update_time": 2456.286583611334, "val/loss": 4.549797477854183, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.47601472598035, "val/val_tokens_per_second": 452716.66887686495, "val/loss_avg_len_2048": 4.549797477854183, "val/perplexity_len_2048": 94.61324509708179, "val/loss_avg_len_1024": 4.565505847024965, "val/perplexity_len_1024": 96.11119928630141, "val/loss_avg_len_512": 4.594841379802302, "val/perplexity_len_512": 98.97243527526068} -{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 4565.060955744993, "val/train_update_time": 2567.792047406314, "val/loss": 4.5204342533537885, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.54267534095561, "val/val_tokens_per_second": 452383.3633781789, "val/loss_avg_len_2048": 4.5204342533537885, "val/perplexity_len_2048": 91.87548655482323, "val/loss_avg_len_1024": 4.536794685186399, "val/perplexity_len_1024": 93.39097238779135, "val/loss_avg_len_512": 4.567255290885735, "val/perplexity_len_512": 96.27948759639831} -{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 4767.3171031199745, "val/train_update_time": 2679.296893617313, "val/loss": 4.492667575135734, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.70578284398653, "val/val_tokens_per_second": 451569.8857971491, "val/loss_avg_len_2048": 4.492667575135734, "val/perplexity_len_2048": 89.35950140608207, "val/loss_avg_len_1024": 4.509617001681402, "val/perplexity_len_1024": 90.88700227462165, "val/loss_avg_len_512": 4.5411422349753785, "val/perplexity_len_512": 93.7978781707472} -{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 4969.727964510967, "val/train_update_time": 2790.797476610227, "val/loss": 4.4686831552135295, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.87361066404264, "val/val_tokens_per_second": 450735.9144276554, "val/loss_avg_len_2048": 4.4686831552135295, "val/perplexity_len_2048": 87.24176347672332, "val/loss_avg_len_1024": 4.486252108311607, "val/perplexity_len_1024": 88.78805350190629, "val/loss_avg_len_512": 4.51881691169506, "val/perplexity_len_512": 91.72701260192157} -{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5172.765950003988, "val/train_update_time": 2902.285505968146, "val/loss": 4.4460520937834875, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.76087000104599, "val/val_tokens_per_second": 451295.80621613644, "val/loss_avg_len_2048": 4.4460520937834875, "val/perplexity_len_2048": 85.28956326961926, "val/loss_avg_len_1024": 4.464425405966584, "val/perplexity_len_1024": 86.87109958090244, "val/loss_avg_len_512": 4.498216118935217, "val/perplexity_len_512": 89.85669458704189} -{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 5375.2188087760005, "val/train_update_time": 3013.786068893096, "val/loss": 4.420050854198402, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.47328536800342, "val/val_tokens_per_second": 452730.3262326961, "val/loss_avg_len_2048": 4.420050854198402, "val/perplexity_len_2048": 83.10051126077735, "val/loss_avg_len_1024": 4.439237378784268, "val/perplexity_len_1024": 84.71031515065614, "val/loss_avg_len_512": 4.474183511526417, "val/perplexity_len_512": 87.7229463868} -{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 5577.396705480001, "val/train_update_time": 3125.264993761957, "val/loss": 4.398202258219151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.4617071620305, "val/val_tokens_per_second": 452788.27124757314, "val/loss_avg_len_2048": 4.398202258219151, "val/perplexity_len_2048": 81.30457257597935, "val/loss_avg_len_1024": 4.418109852228035, "val/perplexity_len_1024": 82.93936944356574, "val/loss_avg_len_512": 4.454481553460378, "val/perplexity_len_512": 86.01154689490046} -{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 5779.562335913011, "val/train_update_time": 3236.749040101946, "val/loss": 4.376139390771115, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.2175632059807, "val/val_tokens_per_second": 454013.5927467024, "val/loss_avg_len_2048": 4.376139390771115, "val/perplexity_len_2048": 79.53040415674566, "val/loss_avg_len_1024": 4.396944615813718, "val/perplexity_len_1024": 81.2023847690807, "val/loss_avg_len_512": 4.435031853418239, "val/perplexity_len_512": 84.35481183459801} -{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 5981.512405745976, "val/train_update_time": 3348.2465934828506, "val/loss": 4.355563867319981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.72774724900955, "val/val_tokens_per_second": 451460.5646228822, "val/loss_avg_len_2048": 4.355563867319981, "val/perplexity_len_2048": 77.9107442760098, "val/loss_avg_len_1024": 4.377231672070688, "val/perplexity_len_1024": 79.61732119023712, "val/loss_avg_len_512": 4.4167310255174534, "val/perplexity_len_512": 82.82508923002905} -{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 6184.421642497007, "val/train_update_time": 3459.7488345169113, "val/loss": 4.335396474500792, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.66217868198873, "val/val_tokens_per_second": 451787.0692659326, "val/loss_avg_len_2048": 4.335396474500792, "val/perplexity_len_2048": 76.35522578937196, "val/loss_avg_len_1024": 4.358097750052391, "val/perplexity_len_1024": 78.10841129235861, "val/loss_avg_len_512": 4.399293633644097, "val/perplexity_len_512": 81.3933548269956} -{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 6386.8215373010025, "val/train_update_time": 3571.2574781817966, "val/loss": 4.31625511668981, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.28017372795148, "val/val_tokens_per_second": 453698.72817733017, "val/loss_avg_len_2048": 4.31625511668981, "val/perplexity_len_2048": 74.90758222363428, "val/loss_avg_len_1024": 4.339928405715012, "val/perplexity_len_1024": 76.70204771345033, "val/loss_avg_len_512": 4.382818944332655, "val/perplexity_len_512": 80.06340988951868} -{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 6588.8636812510085, "val/train_update_time": 3682.7761907348176, "val/loss": 4.2996819401991555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.226537807961, "val/val_tokens_per_second": 453968.4331807084, "val/loss_avg_len_2048": 4.2996819401991555, "val/perplexity_len_2048": 73.67635648530485, "val/loss_avg_len_1024": 4.324086923091021, "val/perplexity_len_1024": 75.49654722507492, "val/loss_avg_len_512": 4.368286226595659, "val/perplexity_len_512": 78.90828483582929} -{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 6790.794109267998, "val/train_update_time": 3794.261904676736, "val/loss": 4.283236857734924, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.13755302602658, "val/val_tokens_per_second": 454416.59580189723, "val/loss_avg_len_2048": 4.283236857734924, "val/perplexity_len_2048": 72.47465088349, "val/loss_avg_len_1024": 4.308458720062673, "val/perplexity_len_1024": 74.32584368113129, "val/loss_avg_len_512": 4.3540999811033725, "val/perplexity_len_512": 77.7967752505338} -{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 6992.639220207988, "val/train_update_time": 3905.7538149688044, "val/loss": 4.267898958559893, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.21615546499379, "val/val_tokens_per_second": 454020.67721555196, "val/loss_avg_len_2048": 4.267898958559893, "val/perplexity_len_2048": 71.371523450084, "val/loss_avg_len_1024": 4.29405301307696, "val/perplexity_len_1024": 73.26280266811683, "val/loss_avg_len_512": 4.3412228272167965, "val/perplexity_len_512": 76.80139677915308} -{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 7195.047693797969, "val/train_update_time": 4017.255656591733, "val/loss": 4.25385695036254, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.40624964801827, "val/val_tokens_per_second": 453066.0231949778, "val/loss_avg_len_2048": 4.25385695036254, "val/perplexity_len_2048": 70.3763275596729, "val/loss_avg_len_1024": 4.280959435730102, "val/perplexity_len_1024": 72.30978332653642, "val/loss_avg_len_512": 4.3297005797375, "val/perplexity_len_512": 75.92155071492496} -{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 7397.135954166006, "val/train_update_time": 4128.71436746855, "val/loss": 4.241492192271679, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.44096185202943, "val/val_tokens_per_second": 452892.13163184514, "val/loss_avg_len_2048": 4.241492192271679, "val/perplexity_len_2048": 69.51149901038498, "val/loss_avg_len_1024": 4.26916799461185, "val/perplexity_len_1024": 71.46215398096777, "val/loss_avg_len_512": 4.318984106020722, "val/perplexity_len_512": 75.11228340295357} -{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 7599.255373338994, "val/train_update_time": 4240.1828456086805, "val/loss": 4.230574601543066, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.44047201605281, "val/val_tokens_per_second": 452894.58454761014, "val/loss_avg_len_2048": 4.230574601543066, "val/perplexity_len_2048": 68.75672854774076, "val/loss_avg_len_1024": 4.259278581639892, "val/perplexity_len_1024": 70.75891825403492, "val/loss_avg_len_512": 4.310739070640505, "val/perplexity_len_512": 74.49552605583897} -{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 7801.41744280397, "val/train_update_time": 4351.678691691719, "val/loss": 4.220437906114012, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.25361566705396, "val/val_tokens_per_second": 453832.23372570076, "val/loss_avg_len_2048": 4.220437906114012, "val/perplexity_len_2048": 68.06328309221031, "val/loss_avg_len_1024": 4.2494930887183635, "val/perplexity_len_1024": 70.0698841278538, "val/loss_avg_len_512": 4.301655115112848, "val/perplexity_len_512": 73.82187634450761} -{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 8003.3749305050005, "val/train_update_time": 4463.172267011658, "val/loss": 4.212082446529414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.65977515699342, "val/val_tokens_per_second": 451799.04681067786, "val/loss_avg_len_2048": 4.212082446529414, "val/perplexity_len_2048": 67.49695235274098, "val/loss_avg_len_1024": 4.241754451114219, "val/perplexity_len_1024": 69.52973140635773, "val/loss_avg_len_512": 4.295026430321672, "val/perplexity_len_512": 73.33415266465448} -{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 8206.275687849964, "val/train_update_time": 4574.681749307667, "val/loss": 4.20452369724242, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.39738410396967, "val/val_tokens_per_second": 453110.45674607414, "val/loss_avg_len_2048": 4.20452369724242, "val/perplexity_len_2048": 66.9886831719058, "val/loss_avg_len_1024": 4.23449458040651, "val/perplexity_len_1024": 69.02678242730825, "val/loss_avg_len_512": 4.288312720157765, "val/perplexity_len_512": 72.84345745438628} -{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 8408.397738418018, "val/train_update_time": 4686.193155970657, "val/loss": 4.198343608177501, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.30731243803166, "val/val_tokens_per_second": 453562.3848634241, "val/loss_avg_len_2048": 4.198343608177501, "val/perplexity_len_2048": 66.57596377846592, "val/loss_avg_len_1024": 4.228689314186201, "val/perplexity_len_1024": 68.62722447126744, "val/loss_avg_len_512": 4.283119718784839, "val/perplexity_len_512": 72.46616177619002} -{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 8610.412180389976, "val/train_update_time": 4797.692331016588, "val/loss": 4.193367760937754, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.37212692195317, "val/val_tokens_per_second": 453237.0919561705, "val/loss_avg_len_2048": 4.193367760937754, "val/perplexity_len_2048": 66.24551476656741, "val/loss_avg_len_1024": 4.224034787101439, "val/perplexity_len_1024": 68.30853943562786, "val/loss_avg_len_512": 4.2790641182546505, "val/perplexity_len_512": 72.17286312516411} -{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 8812.488870778994, "val/train_update_time": 4909.191867132671, "val/loss": 4.189421963141696, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.32052933302475, "val/val_tokens_per_second": 453496.01361363375, "val/loss_avg_len_2048": 4.189421963141696, "val/perplexity_len_2048": 65.98463838160738, "val/loss_avg_len_1024": 4.22020412462051, "val/perplexity_len_1024": 68.04737301604953, "val/loss_avg_len_512": 4.2755115066579545, "val/perplexity_len_512": 71.9169158844202} -{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 9014.539005896018, "val/train_update_time": 5020.712582220614, "val/loss": 4.186491362652555, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.58312447200296, "val/val_tokens_per_second": 452181.35539870604, "val/loss_avg_len_2048": 4.186491362652555, "val/perplexity_len_2048": 65.79154684336461, "val/loss_avg_len_1024": 4.217514740810637, "val/perplexity_len_1024": 67.86461337831629, "val/loss_avg_len_512": 4.273214724269416, "val/perplexity_len_512": 71.75192792183111} -{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 9217.298647498013, "val/train_update_time": 5132.212492840539, "val/loss": 4.184350719433324, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.41417535103392, "val/val_tokens_per_second": 453026.30744540226, "val/loss_avg_len_2048": 4.184350719433324, "val/perplexity_len_2048": 65.65086124728785, "val/loss_avg_len_1024": 4.215543735674023, "val/perplexity_len_1024": 67.73098361249181, "val/loss_avg_len_512": 4.271550920667592, "val/perplexity_len_512": 71.63264606402603} -{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 9419.395047847007, "val/train_update_time": 5243.674068749533, "val/loss": 4.183023557277815, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.3052227080334, "val/val_tokens_per_second": 453572.88063424785, "val/loss_avg_len_2048": 4.183023557277815, "val/perplexity_len_2048": 65.56378970057509, "val/loss_avg_len_1024": 4.214286808201578, "val/perplexity_len_1024": 67.64590415900594, "val/loss_avg_len_512": 4.270415190260951, "val/perplexity_len_512": 71.5513368711842} -{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 9621.420487532974, "val/train_update_time": 5355.161261588568, "val/loss": 4.18224084139755, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.24478351400467, "val/val_tokens_per_second": 453876.649763846, "val/loss_avg_len_2048": 4.18224084139755, "val/perplexity_len_2048": 65.51249195960324, "val/loss_avg_len_1024": 4.213500716115302, "val/perplexity_len_1024": 67.59274914418302, "val/loss_avg_len_512": 4.269666781060863, "val/perplexity_len_512": 71.4978072259293} -{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 9823.39123856998, "val/train_update_time": 5466.646158660587, "val/loss": 4.1819093990348515, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 90.3263800109853, "val/val_tokens_per_second": 453466.6394802773, "val/loss_avg_len_2048": 4.1819093990348515, "val/perplexity_len_2048": 65.49078194249032, "val/loss_avg_len_1024": 4.213224817466876, "val/perplexity_len_1024": 67.57410296839636, "val/loss_avg_len_512": 4.269427753666788, "val/perplexity_len_512": 71.48071933370453} +{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 137.6325364280492, "val/train_update_time": 137.34443728171755, "val/loss": 8.097969826221465, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.75021567591466, "val/val_tokens_per_second": 406549.99818319886, "val/loss_avg_len_2048": 8.097969826221465, "val/perplexity_len_2048": 3287.7865172400466, "val/loss_avg_len_1024": 8.100544073104858, "val/perplexity_len_1024": 3296.26099445374, "val/loss_avg_len_512": 8.100835843849183, "val/perplexity_len_512": 3297.22288729685} +{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 373.6941461900715, "val/train_update_time": 272.43619964295067, "val/loss": 7.509516383218766, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.37394046399277, "val/val_tokens_per_second": 408074.04601888295, "val/loss_avg_len_2048": 7.509516383218766, "val/perplexity_len_2048": 1825.3305688289142, "val/loss_avg_len_1024": 7.512931452941896, "val/perplexity_len_1024": 1831.5748562589274, "val/loss_avg_len_512": 7.514602960872651, "val/perplexity_len_512": 1834.638908237129} +{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 609.3696456589969, "val/train_update_time": 407.4915405898355, "val/loss": 7.194160730051995, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.84995853202417, "val/val_tokens_per_second": 410215.49334808375, "val/loss_avg_len_2048": 7.194160730051995, "val/perplexity_len_2048": 1331.6322575366655, "val/loss_avg_len_1024": 7.199122499632836, "val/perplexity_len_1024": 1338.255928941527, "val/loss_avg_len_512": 7.204208867883683, "val/perplexity_len_512": 1345.0801319021539} +{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 844.4406513640424, "val/train_update_time": 542.4893449847586, "val/loss": 6.966771374350786, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.66927708103321, "val/val_tokens_per_second": 406876.86638525734, "val/loss_avg_len_2048": 6.966771374350786, "val/perplexity_len_2048": 1060.7923144461163, "val/loss_avg_len_1024": 6.972864169692993, "val/perplexity_len_1024": 1067.2752344159362, "val/loss_avg_len_512": 6.979901323080063, "val/perplexity_len_512": 1074.8123025952734} +{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 1080.3595280270092, "val/train_update_time": 677.5131666237721, "val/loss": 6.773546469050646, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.50302975601517, "val/val_tokens_per_second": 407549.90271871403, "val/loss_avg_len_2048": 6.773546469050646, "val/perplexity_len_2048": 874.407460577292, "val/loss_avg_len_1024": 6.781000710988045, "val/perplexity_len_1024": 880.9498593575831, "val/loss_avg_len_512": 6.790085806655885, "val/perplexity_len_512": 888.9898398032801} +{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1316.5735572939739, "val/train_update_time": 812.5169301189017, "val/loss": 6.612147194027901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.81014677300118, "val/val_tokens_per_second": 406308.30636752775, "val/loss_avg_len_2048": 6.612147194027901, "val/perplexity_len_2048": 744.0789866334422, "val/loss_avg_len_1024": 6.620804714360833, "val/perplexity_len_1024": 750.5488315856857, "val/loss_avg_len_512": 6.631752208733559, "val/perplexity_len_512": 758.8106010204962} +{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1552.605858018971, "val/train_update_time": 947.4958640788682, "val/loss": 6.478030627672375, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.42867044196464, "val/val_tokens_per_second": 407851.6604844412, "val/loss_avg_len_2048": 6.478030627672375, "val/perplexity_len_2048": 650.6882362125918, "val/loss_avg_len_1024": 6.487826892974973, "val/perplexity_len_1024": 657.0938752461975, "val/loss_avg_len_512": 6.500485957857967, "val/perplexity_len_512": 665.4649423985985} +{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1788.244720379007, "val/train_update_time": 1082.4689192509977, "val/loss": 6.371274175237306, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.33702367299702, "val/val_tokens_per_second": 408224.18784805224, "val/loss_avg_len_2048": 6.371274175237306, "val/perplexity_len_2048": 584.8024952309793, "val/loss_avg_len_1024": 6.382026187753306, "val/perplexity_len_1024": 591.1242237288146, "val/loss_avg_len_512": 6.395917515824736, "val/perplexity_len_512": 599.3930235893379} +{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 2023.8116153230658, "val/train_update_time": 1217.4519352857023, "val/loss": 6.273945900231414, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.3569782380946, "val/val_tokens_per_second": 408143.0182445644, "val/loss_avg_len_2048": 6.273945900231414, "val/perplexity_len_2048": 530.5668165325196, "val/loss_avg_len_1024": 6.285613833794371, "val/perplexity_len_1024": 536.7936916369549, "val/loss_avg_len_512": 6.301188182660939, "val/perplexity_len_512": 545.2193455867996} +{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 2259.3751640570117, "val/train_update_time": 1352.4327959185466, "val/loss": 6.199569064133987, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.38574551208876, "val/val_tokens_per_second": 408026.0577939073, "val/loss_avg_len_2048": 6.199569064133987, "val/perplexity_len_2048": 492.53674360509217, "val/loss_avg_len_1024": 6.211747290783562, "val/perplexity_len_1024": 498.57164028604257, "val/loss_avg_len_512": 6.228355393150077, "val/perplexity_len_512": 506.92111164444407} +{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2495.436698611011, "val/train_update_time": 1487.4421038717264, "val/loss": 6.132018646597304, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.70304475398734, "val/val_tokens_per_second": 406740.43272537884, "val/loss_avg_len_2048": 6.132018646597304, "val/perplexity_len_2048": 460.3645366385984, "val/loss_avg_len_1024": 6.144993824179471, "val/perplexity_len_1024": 466.37676879497087, "val/loss_avg_len_512": 6.16288100191094, "val/perplexity_len_512": 474.7939886785471} +{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2731.4342436430743, "val/train_update_time": 1622.4905407206388, "val/loss": 6.0769569249046045, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.99657162395306, "val/val_tokens_per_second": 405558.3208557709, "val/loss_avg_len_2048": 6.0769569249046045, "val/perplexity_len_2048": 435.7013035210223, "val/loss_avg_len_1024": 6.090529228214921, "val/perplexity_len_1024": 441.65508563953284, "val/loss_avg_len_512": 6.1094287188325085, "val/perplexity_len_512": 450.081518613422} +{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2967.7298853070242, "val/train_update_time": 1757.5320481728995, "val/loss": 6.024966034000274, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.81495576910675, "val/val_tokens_per_second": 406288.92496674176, "val/loss_avg_len_2048": 6.024966034000274, "val/perplexity_len_2048": 413.6275925097331, "val/loss_avg_len_1024": 6.039175203947909, "val/perplexity_len_1024": 419.54685155414825, "val/loss_avg_len_512": 6.058873298077286, "val/perplexity_len_512": 427.89305723099136} +{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 3203.840811592061, "val/train_update_time": 1892.5856667858316, "val/loss": 5.973690415629559, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.20035982609261, "val/val_tokens_per_second": 408780.96716508834, "val/loss_avg_len_2048": 5.973690415629559, "val/perplexity_len_2048": 392.9531586695076, "val/loss_avg_len_1024": 5.9884474193267065, "val/perplexity_len_1024": 398.7949676698734, "val/loss_avg_len_512": 6.00929208433833, "val/perplexity_len_512": 407.19495852955055} +{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3439.312252484029, "val/train_update_time": 2027.637110557058, "val/loss": 5.9365289360866305, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.62716725701466, "val/val_tokens_per_second": 407047.13365708606, "val/loss_avg_len_2048": 5.9365289360866305, "val/perplexity_len_2048": 378.6184372510177, "val/loss_avg_len_1024": 5.951778036109708, "val/perplexity_len_1024": 384.4362733161468, "val/loss_avg_len_512": 5.973547525303998, "val/perplexity_len_512": 392.89701347613305} +{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3675.6837699849857, "val/train_update_time": 2162.6913318177685, "val/loss": 5.899252319797839, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.47976837505121, "val/val_tokens_per_second": 407644.2517971631, "val/loss_avg_len_2048": 5.899252319797839, "val/perplexity_len_2048": 364.76463858479445, "val/loss_avg_len_1024": 5.914946995119518, "val/perplexity_len_1024": 370.53466203833915, "val/loss_avg_len_512": 5.937802961644158, "val/perplexity_len_512": 379.10111422273775} +{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3911.4375108770328, "val/train_update_time": 2297.7100352901034, "val/loss": 5.866309138357454, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.25587885000277, "val/val_tokens_per_second": 408554.5952001683, "val/loss_avg_len_2048": 5.866309138357454, "val/perplexity_len_2048": 352.9439062000189, "val/loss_avg_len_1024": 5.882450300783432, "val/perplexity_len_1024": 358.6870569906692, "val/loss_avg_len_512": 5.905991363677662, "val/perplexity_len_512": 367.2311049681368} +{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 4147.001440979075, "val/train_update_time": 2432.7737647151807, "val/loss": 5.83839783823652, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.45364938897546, "val/val_tokens_per_second": 407750.24351176294, "val/loss_avg_len_2048": 5.83839783823652, "val/perplexity_len_2048": 343.2289915421645, "val/loss_avg_len_1024": 5.855042658005282, "val/perplexity_len_1024": 348.9897869446356, "val/loss_avg_len_512": 5.879319992480893, "val/perplexity_len_512": 357.56601144266216} +{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 4382.723289367976, "val/train_update_time": 2567.801604798413, "val/loss": 5.807034053320264, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.15528536809143, "val/val_tokens_per_second": 408964.9372917616, "val/loss_avg_len_2048": 5.807034053320264, "val/perplexity_len_2048": 332.6310950837973, "val/loss_avg_len_1024": 5.8241930332270915, "val/perplexity_len_1024": 338.38795503388445, "val/loss_avg_len_512": 5.849660384528758, "val/perplexity_len_512": 347.11647433348577} +{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4618.154446462984, "val/train_update_time": 2702.834149704431, "val/loss": 5.7824100695554, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.07526794599835, "val/val_tokens_per_second": 409291.9343678644, "val/loss_avg_len_2048": 5.7824100695554, "val/perplexity_len_2048": 324.5404136083224, "val/loss_avg_len_1024": 5.799937409455178, "val/perplexity_len_1024": 330.27888692721285, "val/loss_avg_len_512": 5.826135484898323, "val/perplexity_len_512": 339.0458960851378} +{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4853.921031603008, "val/train_update_time": 2837.8658607284306, "val/loss": 5.758844941980299, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.82154056197032, "val/val_tokens_per_second": 410332.2766750086, "val/loss_avg_len_2048": 5.758844941980299, "val/perplexity_len_2048": 316.9819848005806, "val/loss_avg_len_1024": 5.776860027853982, "val/perplexity_len_1024": 322.7441899478597, "val/loss_avg_len_512": 5.803712826554431, "val/perplexity_len_512": 331.5281843098571} +{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 5089.012363151065, "val/train_update_time": 2972.8906648436096, "val/loss": 5.737493131028349, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.32674534106627, "val/val_tokens_per_second": 408266.0098337112, "val/loss_avg_len_2048": 5.737493131028349, "val/perplexity_len_2048": 310.2855898704255, "val/loss_avg_len_1024": 5.755842571069266, "val/perplexity_len_1024": 316.0317145557937, "val/loss_avg_len_512": 5.7835284965001055, "val/perplexity_len_512": 324.90359140760836} +{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 5324.629123619059, "val/train_update_time": 3107.9427982217167, "val/loss": 5.716309683084255, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.80655148799997, "val/val_tokens_per_second": 410393.9008946195, "val/loss_avg_len_2048": 5.716309683084255, "val/perplexity_len_2048": 303.78180077501713, "val/loss_avg_len_1024": 5.734943081327493, "val/perplexity_len_1024": 309.4953541931483, "val/loss_avg_len_512": 5.763196059111692, "val/perplexity_len_512": 318.3642154880775} +{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 5559.744059987017, "val/train_update_time": 3242.99941329984, "val/loss": 5.700755887414662, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.3600756629603, "val/val_tokens_per_second": 408130.42167839885, "val/loss_avg_len_2048": 5.700755887414662, "val/perplexity_len_2048": 299.0933964769242, "val/loss_avg_len_1024": 5.719829780167866, "val/perplexity_len_1024": 304.8530265091337, "val/loss_avg_len_512": 5.7488650611438565, "val/perplexity_len_512": 313.8342753728958} +{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5795.387424343033, "val/train_update_time": 3378.042265718919, "val/loss": 5.678921429768181, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.28977326198947, "val/val_tokens_per_second": 408416.5181329025, "val/loss_avg_len_2048": 5.678921429768181, "val/perplexity_len_2048": 292.633633722812, "val/loss_avg_len_1024": 5.698480519391351, "val/perplexity_len_1024": 298.4136225877653, "val/loss_avg_len_512": 5.728350201375177, "val/perplexity_len_512": 307.4615999924049} +{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 6031.410010787076, "val/train_update_time": 3513.0946629439713, "val/loss": 5.662745923463191, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.90603236400057, "val/val_tokens_per_second": 409985.2534506138, "val/loss_avg_len_2048": 5.662745923463191, "val/perplexity_len_2048": 287.9382143067501, "val/loss_avg_len_1024": 5.68280623494921, "val/perplexity_len_1024": 293.7726694126168, "val/loss_avg_len_512": 5.713412875633244, "val/perplexity_len_512": 302.90307675268207} +{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 6266.654250354972, "val/train_update_time": 3648.1618733102223, "val/loss": 5.650785900356039, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.81599653395824, "val/val_tokens_per_second": 410355.06754736515, "val/loss_avg_len_2048": 5.650785900356039, "val/perplexity_len_2048": 284.5149784059353, "val/loss_avg_len_1024": 5.6710522111513475, "val/perplexity_len_1024": 290.339872575047, "val/loss_avg_len_512": 5.7019783896925045, "val/perplexity_len_512": 299.45926242581703} +{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 6501.729716927046, "val/train_update_time": 3783.1945504179457, "val/loss": 5.63273147483218, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.2614537080517, "val/val_tokens_per_second": 408531.8782557271, "val/loss_avg_len_2048": 5.63273147483218, "val/perplexity_len_2048": 279.4243166839047, "val/loss_avg_len_1024": 5.653354023430002, "val/perplexity_len_1024": 285.24658691511917, "val/loss_avg_len_512": 5.684925929842354, "val/perplexity_len_512": 294.3960382822156} +{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 6737.260854291031, "val/train_update_time": 3918.229618334677, "val/loss": 5.619587636593296, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.2222726480104, "val/val_tokens_per_second": 408691.59037986683, "val/loss_avg_len_2048": 5.619587636593296, "val/perplexity_len_2048": 275.7756400022235, "val/loss_avg_len_1024": 5.640656256317569, "val/perplexity_len_1024": 281.647490784806, "val/loss_avg_len_512": 5.673037551006105, "val/perplexity_len_512": 290.9168684724483} +{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6972.698619629024, "val/train_update_time": 4053.2279500714503, "val/loss": 5.606948659898109, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.518833424896, "val/val_tokens_per_second": 407485.8273261181, "val/loss_avg_len_2048": 5.606948659898109, "val/perplexity_len_2048": 272.31205232418614, "val/loss_avg_len_1024": 5.6283089039244, "val/perplexity_len_1024": 278.1912714547652, "val/loss_avg_len_512": 5.661133850814554, "val/perplexity_len_512": 287.47441092978147} +{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 7208.982709518052, "val/train_update_time": 4188.228603066411, "val/loss": 5.5957299809077465, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.08489269192796, "val/val_tokens_per_second": 409252.5744727456, "val/loss_avg_len_2048": 5.5957299809077465, "val/perplexity_len_2048": 269.27414334897264, "val/loss_avg_len_1024": 5.61745572001168, "val/perplexity_len_1024": 275.18833560693116, "val/loss_avg_len_512": 5.650942821657832, "val/perplexity_len_512": 284.5596283699001} +{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 7444.285434685065, "val/train_update_time": 4323.2266572538065, "val/loss": 5.584819928003452, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.3505922720069, "val/val_tokens_per_second": 408168.99106061296, "val/loss_avg_len_2048": 5.584819928003452, "val/perplexity_len_2048": 266.35231582754994, "val/loss_avg_len_1024": 5.606869975394156, "val/perplexity_len_1024": 272.29062642838284, "val/loss_avg_len_512": 5.640904106772971, "val/perplexity_len_512": 281.7173058951559} +{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 7679.831617571064, "val/train_update_time": 4458.218750593718, "val/loss": 5.574874783031576, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.00809445802588, "val/val_tokens_per_second": 409566.84778341826, "val/loss_avg_len_2048": 5.574874783031576, "val/perplexity_len_2048": 263.7165317847754, "val/loss_avg_len_1024": 5.597197585988651, "val/perplexity_len_1024": 269.6696215818559, "val/loss_avg_len_512": 5.631731116879685, "val/perplexity_len_512": 279.14493211217706} +{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 7915.0508393970085, "val/train_update_time": 4593.218119100784, "val/loss": 5.566041899083901, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.56229530100245, "val/val_tokens_per_second": 407309.71660301485, "val/loss_avg_len_2048": 5.566041899083901, "val/perplexity_len_2048": 261.39741160224924, "val/loss_avg_len_1024": 5.588593836321857, "val/perplexity_len_1024": 267.3594041821592, "val/loss_avg_len_512": 5.623624696305592, "val/perplexity_len_512": 276.8912130304534} +{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 8150.834610218997, "val/train_update_time": 4728.229743778473, "val/loss": 5.558274031948855, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.88718269299716, "val/val_tokens_per_second": 410062.62160672195, "val/loss_avg_len_2048": 5.558274031948855, "val/perplexity_len_2048": 259.3747771876053, "val/loss_avg_len_1024": 5.581072434914089, "val/perplexity_len_1024": 265.35603031714436, "val/loss_avg_len_512": 5.616466402893375, "val/perplexity_len_512": 274.9162217014239} +{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 8386.480246171006, "val/train_update_time": 4863.235517622321, "val/loss": 5.550198807562748, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.08558377600275, "val/val_tokens_per_second": 409249.7486118562, "val/loss_avg_len_2048": 5.550198807562748, "val/perplexity_len_2048": 257.2887017612347, "val/loss_avg_len_1024": 5.5732162203862625, "val/perplexity_len_1024": 263.27950391543754, "val/loss_avg_len_512": 5.609014750022057, "val/perplexity_len_512": 272.87525517970744} +{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 8621.791595970979, "val/train_update_time": 4998.250562901143, "val/loss": 5.543446067120804, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.18454685201868, "val/val_tokens_per_second": 408845.4885213135, "val/loss_avg_len_2048": 5.543446067120804, "val/perplexity_len_2048": 255.55715087625723, "val/loss_avg_len_1024": 5.5666869797617435, "val/perplexity_len_1024": 261.5660884209409, "val/loss_avg_len_512": 5.602842293289584, "val/perplexity_len_512": 271.19613196255517} +{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 8857.199786913, "val/train_update_time": 5133.253916564281, "val/loss": 5.537739042675431, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.4387403059518, "val/val_tokens_per_second": 407810.76978095865, "val/loss_avg_len_2048": 5.537739042675431, "val/perplexity_len_2048": 254.1028338277623, "val/loss_avg_len_1024": 5.561090528252709, "val/perplexity_len_1024": 260.1063350202336, "val/loss_avg_len_512": 5.59754501928999, "val/perplexity_len_512": 269.7633300665345} +{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 9092.876169198076, "val/train_update_time": 5268.2632564986125, "val/loss": 5.53244903423907, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.48596478602849, "val/val_tokens_per_second": 407619.11464172014, "val/loss_avg_len_2048": 5.53244903423907, "val/perplexity_len_2048": 252.76217686286913, "val/loss_avg_len_1024": 5.556028684659931, "val/perplexity_len_1024": 258.79304407566934, "val/loss_avg_len_512": 5.5928602047552936, "val/perplexity_len_512": 268.5024945930998} +{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 9328.580250757048, "val/train_update_time": 5403.266901573632, "val/loss": 5.527832486407514, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.72994552296586, "val/val_tokens_per_second": 406631.8093129649, "val/loss_avg_len_2048": 5.527832486407514, "val/perplexity_len_2048": 251.5979775419659, "val/loss_avg_len_1024": 5.551547197632486, "val/perplexity_len_1024": 257.6358612923782, "val/loss_avg_len_512": 5.5885610032757045, "val/perplexity_len_512": 267.3506261026088} +{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 9565.069234885043, "val/train_update_time": 5538.269277713727, "val/loss": 5.52392855829644, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.26257929799613, "val/val_tokens_per_second": 408527.2919047938, "val/loss_avg_len_2048": 5.52392855829644, "val/perplexity_len_2048": 250.61767189118544, "val/loss_avg_len_1024": 5.5477417929501565, "val/perplexity_len_1024": 256.65731564144943, "val/loss_avg_len_512": 5.584945974971866, "val/perplexity_len_512": 266.38589084546015} +{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 9800.571015445981, "val/train_update_time": 5673.296408364549, "val/loss": 5.520575026244429, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.78671431797557, "val/val_tokens_per_second": 406402.77121023956, "val/loss_avg_len_2048": 5.520575026244429, "val/perplexity_len_2048": 249.77862516707805, "val/loss_avg_len_1024": 5.5444952042206666, "val/perplexity_len_1024": 255.8254060580028, "val/loss_avg_len_512": 5.581906709400076, "val/perplexity_len_512": 265.5775024543716} +{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 10036.600267508999, "val/train_update_time": 5808.320603007567, "val/loss": 5.517816926866491, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.26595556503162, "val/val_tokens_per_second": 408513.53551838145, "val/loss_avg_len_2048": 5.517816926866491, "val/perplexity_len_2048": 249.09066007055384, "val/loss_avg_len_1024": 5.541841141779768, "val/perplexity_len_1024": 255.14732968290977, "val/loss_avg_len_512": 5.579388280792035, "val/perplexity_len_512": 264.90950597831915} +{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 10272.080307441996, "val/train_update_time": 5943.3299956357805, "val/loss": 5.515734384438093, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.35448634903878, "val/val_tokens_per_second": 408153.1527901874, "val/loss_avg_len_2048": 5.515734384438093, "val/perplexity_len_2048": 248.57245797864684, "val/loss_avg_len_1024": 5.53980909948076, "val/perplexity_len_1024": 254.62938593653982, "val/loss_avg_len_512": 5.577455671829229, "val/perplexity_len_512": 264.3980338896991} +{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 10507.649754095008, "val/train_update_time": 6078.3277447193395, "val/loss": 5.514074763978267, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.78998316905927, "val/val_tokens_per_second": 406389.59063318896, "val/loss_avg_len_2048": 5.514074763978267, "val/perplexity_len_2048": 248.16026417887818, "val/loss_avg_len_1024": 5.538199598210049, "val/perplexity_len_1024": 254.2198892474354, "val/loss_avg_len_512": 5.57595400317039, "val/perplexity_len_512": 264.00129360952474} +{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 10899.09726011497, "val/train_update_time": 6367.991133535514, "val/loss": 5.512881924894559, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 256.65374484693166, "val/val_tokens_per_second": 159592.45022677752, "val/loss_avg_len_2048": 5.512881924894559, "val/perplexity_len_2048": 247.86442539585198, "val/loss_avg_len_1024": 5.5370327444368685, "val/perplexity_len_1024": 253.92342480941105, "val/loss_avg_len_512": 5.574820502644323, "val/perplexity_len_512": 263.70221753780015} +{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 11471.464317596983, "val/train_update_time": 6683.226909449557, "val/loss": 5.5120874294808715, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 269.61758992495015, "val/val_tokens_per_second": 151918.87150761005, "val/loss_avg_len_2048": 5.5120874294808715, "val/perplexity_len_2048": 247.66757645480368, "val/loss_avg_len_1024": 5.536254765025649, "val/perplexity_len_1024": 253.72595443678495, "val/loss_avg_len_512": 5.574072946217401, "val/perplexity_len_512": 263.5051589156827} +{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 12034.565373313962, "val/train_update_time": 6976.259414628497, "val/loss": 5.511645271555128, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 284.5950597979827, "val/val_tokens_per_second": 143923.79133030312, "val/loss_avg_len_2048": 5.511645271555128, "val/perplexity_len_2048": 247.55809247931202, "val/loss_avg_len_1024": 5.535829944479786, "val/perplexity_len_1024": 253.6181893303579, "val/loss_avg_len_512": 5.573694263508974, "val/perplexity_len_512": 263.40539295943375} +{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 12586.090243482962, "val/train_update_time": 7242.765970333596, "val/loss": 5.511438396552292, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 301.2079832049785, "val/val_tokens_per_second": 135985.77157274692, "val/loss_avg_len_2048": 5.511438396552292, "val/perplexity_len_2048": 247.5068841952679, "val/loss_avg_len_1024": 5.535628381019627, "val/perplexity_len_1024": 253.5670743221894, "val/loss_avg_len_512": 5.573483581502771, "val/perplexity_len_512": 263.34990402826514} diff --git a/metrics/npz/train_eval/step-000000104857600.npz b/metrics/npz/train_eval/step-000000104857600.npz index a4055508669eaf253dc2f103f97635cbfc530e2a..262c55db969344ea05a642378f24efdd95a94174 100644 --- a/metrics/npz/train_eval/step-000000104857600.npz +++ b/metrics/npz/train_eval/step-000000104857600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cc9ccb7aaac127061005cd6916da7a0fdc9141a6974c4ce9749f33bfaa724f2 +oid sha256:8cf5d84bbb384a5ed35bff5e67720db17263aaff3e72d07276587986862d9174 size 20540 diff --git a/metrics/npz/train_eval/step-000000209715200.npz b/metrics/npz/train_eval/step-000000209715200.npz index 98776dd7c646aff1b580b8f34e57ad46e43089b3..ef8982acccf45e2a57fce3746b193df123acf527 100644 --- a/metrics/npz/train_eval/step-000000209715200.npz +++ b/metrics/npz/train_eval/step-000000209715200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0aeceb03367da089f0498adc7839eac6c2c50c55310f643abc99139b5d11b4f7 +oid sha256:18da0fdb5a808db6ae8ce810f5cec11532442f666351bd5b4d8e1a75c64d0bb0 size 20540 diff --git a/metrics/npz/train_eval/step-000000314572800.npz b/metrics/npz/train_eval/step-000000314572800.npz index 92c7643912f609f162ed8db5a05da31feb2efba5..2cef44de5933d42b84cfc04a87b22e2fdad44e52 100644 --- a/metrics/npz/train_eval/step-000000314572800.npz +++ b/metrics/npz/train_eval/step-000000314572800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0df27fda4881a26991e0c397b6e1e3a9d65baf937e9d001e99a68387e05b459 +oid sha256:88a5c5d61b855a58cddbd35d37b9601195794c15f09ddc3c5d20864caacb6a81 size 20540 diff --git a/metrics/npz/train_eval/step-000000419430400.npz b/metrics/npz/train_eval/step-000000419430400.npz index d52f50bcc2172c306770b6160b1a9bf62f3a3e51..4656449108e60e4c45c1470dfcfde35d4f97a656 100644 --- a/metrics/npz/train_eval/step-000000419430400.npz +++ b/metrics/npz/train_eval/step-000000419430400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe26f11b78bfb02f809af362b6c54ee2cbbc05dcb799bf5ce26f93239ec8ef14 +oid sha256:515dfb9054b9bbab1668e9065591fb7e7bab14730c9e82620a1ef456c43603e6 size 20540 diff --git a/metrics/npz/train_eval/step-000000524288000.npz b/metrics/npz/train_eval/step-000000524288000.npz index f585bca3f9c3c142447a1c933942ebaef214d620..ffc989b526e47ca58b57f4d944b0c2cc9a506a53 100644 --- a/metrics/npz/train_eval/step-000000524288000.npz +++ b/metrics/npz/train_eval/step-000000524288000.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19c245a7419211259996f35cf65b3624bce3fb528b539153943e31fd247051f5 +oid sha256:d65701667108388525ef127f105b1fe131f7ad2337ccf74943aeb0e33ecb6e33 size 20540 diff --git a/metrics/npz/train_eval/step-000000629145600.npz b/metrics/npz/train_eval/step-000000629145600.npz index e6b231ff025288e047de2cde53969d90c665768b..89cc1b23d25e256420f933a9d9d2cbfd32186dfa 100644 --- a/metrics/npz/train_eval/step-000000629145600.npz +++ b/metrics/npz/train_eval/step-000000629145600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6d594540203718747de80aeed7d78f257f057ceabb17498738985fd7bf5cccf +oid sha256:58d04f720b85c446a2d4e374352e3b2ce81ac5e180e02ac5ad5e7e8ae48125bb size 20540 diff --git a/metrics/npz/train_eval/step-000000734003200.npz b/metrics/npz/train_eval/step-000000734003200.npz index cb0e6d1873b6aaf97401b602f0167d205f3e6f3c..616942f1ea33890fba51b2fa6e9593f159a369f0 100644 --- a/metrics/npz/train_eval/step-000000734003200.npz +++ b/metrics/npz/train_eval/step-000000734003200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f47eed7277ce3e573dc8d3b1599962824dd5681d8412d943ac4821fc7ac6539 +oid sha256:45713c1b2c7ae72429b5954fa69c547976b917e01fb7d18f4afe19156a8060df size 20540 diff --git a/metrics/npz/train_eval/step-000000838860800.npz b/metrics/npz/train_eval/step-000000838860800.npz index 80abcca38021dc09e95ef7823cbdfd1c6c2e7d7d..b0099af12477b758a66e00a8873098c8aa73dbb8 100644 --- a/metrics/npz/train_eval/step-000000838860800.npz +++ b/metrics/npz/train_eval/step-000000838860800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f766de285d162dea7f84d75b92f8f0af452d47c378929950b89467ca5aad3ee +oid sha256:e4e61cd36e7f7c9bd5418a50d7f1110c844524ae015cd8fff2385390fbfd546e size 20540 diff --git a/metrics/npz/train_eval/step-000000943718400.npz b/metrics/npz/train_eval/step-000000943718400.npz index d23f1fe968a81a556f1cadf042fda33552c23201..0ccea6e3a914b5cacf0fdffc1bded884e7a968da 100644 --- a/metrics/npz/train_eval/step-000000943718400.npz +++ b/metrics/npz/train_eval/step-000000943718400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:962ca33e9daf0eff241acee173d547c3784f67d895ab231209b622c4b8ddc50e +oid sha256:efd19824b71399982fd4597c3ec748e3ed162c03f0238a7afadf4912a03ec426 size 20540 diff --git a/metrics/npz/train_eval/step-000001048576000.npz b/metrics/npz/train_eval/step-000001048576000.npz index 0e107a9390688459a2562b8ecddc6f3f79c01721..6715838d47641e6e3e01918bf67a1aff8431077a 100644 --- a/metrics/npz/train_eval/step-000001048576000.npz +++ b/metrics/npz/train_eval/step-000001048576000.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40fa4c61ddc1addb7fda301757febf0bd1b923a5d6cd2610a9d495d0e1328db3 +oid sha256:d0aabd48e8e5a8bdf87e2079537c6d3ee09db8794637c4f2f848a6c71d6d3a61 size 20540 diff --git a/metrics/npz/train_eval/step-000001153433600.npz b/metrics/npz/train_eval/step-000001153433600.npz index 878fa1ba2c82ed71234b27c291916e545e4735de..081b12fafea17101ac7cae6df0045daff25ba956 100644 --- a/metrics/npz/train_eval/step-000001153433600.npz +++ b/metrics/npz/train_eval/step-000001153433600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02de2b7a49c30cb6207648d3c5baf02cb2dcc10b956fd850354a2557eeaab7b0 +oid sha256:27308144d6b352e7fad503390e18ea8cfa021701a4cca346fea659dda63b9350 size 20540 diff --git a/metrics/npz/train_eval/step-000001258291200.npz b/metrics/npz/train_eval/step-000001258291200.npz index c17c47855678f0c11a039302b20a0afd9f925c79..f8a9511628dd4bc40541fdd7c7d26463b055ba7d 100644 --- a/metrics/npz/train_eval/step-000001258291200.npz +++ b/metrics/npz/train_eval/step-000001258291200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b56c36fa53729b512adff50d89e7641ff6aa48e7a2418ba2326ac4daa1f78999 +oid sha256:9d53fff2a6858900eca70ec6aab7aa4540f97d46ad666f12122dc29f8e358549 size 20540 diff --git a/metrics/npz/train_eval/step-000001363148800.npz b/metrics/npz/train_eval/step-000001363148800.npz index 610e6eb0084ec4b127004600fa78c73fb901da27..6600d87fae1b3ffc86b88d95ef1c1cfa7b4f0f97 100644 --- a/metrics/npz/train_eval/step-000001363148800.npz +++ b/metrics/npz/train_eval/step-000001363148800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f7d719cd57293966756c88f928ac561941e0c39898b31c6d25c8c56f5c21e64 +oid sha256:aae4b5aefc00c7c0049e6436ccdf6865d96185ef36041a0c887ea3c19bda2df7 size 20540 diff --git a/metrics/npz/train_eval/step-000001468006400.npz b/metrics/npz/train_eval/step-000001468006400.npz index 7e99be267396e1c743bd308038b4cfa65377942b..e69a468959e0ebf399c1ba4aa971f189bb4206a0 100644 --- a/metrics/npz/train_eval/step-000001468006400.npz +++ b/metrics/npz/train_eval/step-000001468006400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d98de5240d31491fe46abe75f79278c60a12664aaaacb9ffabfee6c8daf8b4e6 +oid sha256:980560c5c69a512265240185c1c0622bdb2c08532e5a76a06bdbe2eec0c759af size 20540 diff --git a/metrics/npz/train_eval/step-000001572864000.npz b/metrics/npz/train_eval/step-000001572864000.npz index 1d061c4500a4bc013005f200fe830619039c5fa0..ff111c2ddcf3cdb24fcca2a7d82eb31bd036cb14 100644 --- a/metrics/npz/train_eval/step-000001572864000.npz +++ b/metrics/npz/train_eval/step-000001572864000.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d12803667e8453755efe60cbe4ce89f1cd74bf9b272052f150335069da474cd1 +oid sha256:6edc41fdebf362fbe6b1bd8131d0cdb29155c836d8edfb6b51673dcbd5a21ed0 size 20540 diff --git a/metrics/npz/train_eval/step-000001677721600.npz b/metrics/npz/train_eval/step-000001677721600.npz index c455b93ebe92fc2f9504b9637591949f70c331b5..173fe5b9e52d498028cd2ed28c0a369a2c0ccd7e 100644 --- a/metrics/npz/train_eval/step-000001677721600.npz +++ b/metrics/npz/train_eval/step-000001677721600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e8657c3c7d79428f8807bcd207213906927cd6a953b56ff4e6dc390d701763b +oid sha256:07474999739945278b5cab74934b30351f702800be0f68693ec4edc70a5f02bb size 20540 diff --git a/metrics/npz/train_eval/step-000001782579200.npz b/metrics/npz/train_eval/step-000001782579200.npz index af247b899014871d8ad46ec4d0c59458bdd07707..666e0148531f0039be0a78d5bd4ce7cd6a776d61 100644 --- a/metrics/npz/train_eval/step-000001782579200.npz +++ b/metrics/npz/train_eval/step-000001782579200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73db4b8038acb5e7a90551f60aa0c797607fab9282878271d42738f6ecfe28ee +oid sha256:c69f954c643fb430cb55fc822dc547e5f1e5091eb54de1bf8c72d68b02871134 size 20540 diff --git a/metrics/npz/train_eval/step-000001887436800.npz b/metrics/npz/train_eval/step-000001887436800.npz index d45e9733e071bc793e7a3cb050219369f370a44f..ced5e64606b74694ee37f037576e61c4e5da0e57 100644 --- a/metrics/npz/train_eval/step-000001887436800.npz +++ b/metrics/npz/train_eval/step-000001887436800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76b602173aeb194e597e04a7368859aadbf5fcd083d8c6cd0a2fe2e54457a3ec +oid sha256:e9f3a686cee7527e44101251343da7f82ac0e60f195c420effec6e59f0b89f19 size 20540 diff --git a/metrics/npz/train_eval/step-000001992294400.npz b/metrics/npz/train_eval/step-000001992294400.npz index 34f383a134a93d93f38c95de51f6fe2e19d8ae82..d0c131a605d3431d680bd06e63454243aa7b6452 100644 --- a/metrics/npz/train_eval/step-000001992294400.npz +++ b/metrics/npz/train_eval/step-000001992294400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35a775ddbac055759dc01b8e2b248726be2e1aa670cd5e7d41aaf344c6a78e04 +oid sha256:4a421379bf34bc4da17cfb44894e6318dd0373d7f0796fb7a04f98ff97bd68d4 size 20540 diff --git a/metrics/npz/val/step-000000041943040.npz b/metrics/npz/val/step-000000041943040.npz index 9368a8422c55d0845328c63c3e063143630098e9..f4de96307d660fba50ccc5e35c75776046befc62 100644 --- a/metrics/npz/val/step-000000041943040.npz +++ b/metrics/npz/val/step-000000041943040.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cb48cd9dad7a056c937dd0f7b8f793ed49e2f3591d03fad8e00ac8fe953d864 +oid sha256:d460803409cd699e0c8d2c573ed45b4c5ad428d905ebb57bdd6599fb3104dd3a size 21142 diff --git a/metrics/npz/val/step-000000083886080.npz b/metrics/npz/val/step-000000083886080.npz index 495e4215012eddea5cebf3ae15eb29b353c75084..d935d963cac1c7b0f42ad12dd002bd4ba222c375 100644 --- a/metrics/npz/val/step-000000083886080.npz +++ b/metrics/npz/val/step-000000083886080.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9840ddd877c006b1a3adbcf06b51cdb4ee6460ccfd7a0b72b850f9dee5bba675 +oid sha256:eb7c288497e770f3e8d3552e0ef8bf2483cfee12c60fa3c5017efbf1ff0e5088 size 21142 diff --git a/metrics/npz/val/step-000000125829120.npz b/metrics/npz/val/step-000000125829120.npz index 1c43762dd33c74cc4dc195d06c5f1555e08f30af..79c6ad6b9b2ef8cf69bcdc4ae2bacd29dd61b651 100644 --- a/metrics/npz/val/step-000000125829120.npz +++ b/metrics/npz/val/step-000000125829120.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f70834d868afc0c8b82a7d4af8ce24c59be911e829ecd7df4a4993d897ad2879 +oid sha256:1650e1457fc57890b94c5edf45edd1fe51088f8bc9855555661e17c5a1c089d0 size 21142 diff --git a/metrics/npz/val/step-000000167772160.npz b/metrics/npz/val/step-000000167772160.npz index abfa693152590a40f8d9ea9600bc759eeb918a20..8cb0e44ea1f26b633ae11494355e87eac75cccf1 100644 --- a/metrics/npz/val/step-000000167772160.npz +++ b/metrics/npz/val/step-000000167772160.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a340a8edc847b8beb10b7fd1b6ba8cd5165493ab7ccfeef51f5646439ebd03e +oid sha256:45feabea573f1f8d25f3e4833046698d6b89bb375f8c09f16fabf986100a830d size 21142 diff --git a/metrics/npz/val/step-000000209715200.npz b/metrics/npz/val/step-000000209715200.npz index 586667a8555aafa86e50bf09caac771d47a412fc..d0c87399efd94a1a4fab93877d569fc1483a5d53 100644 --- a/metrics/npz/val/step-000000209715200.npz +++ b/metrics/npz/val/step-000000209715200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33c4ef6eef84fb214a21b6f4d5c761438d49470a2a6e81ab2194eba9f5e3de06 +oid sha256:681de738dd504e8a33ad756187079d4be03cfe6ec2a26e5591b72bf9c606b821 size 21142 diff --git a/metrics/npz/val/step-000000251658240.npz b/metrics/npz/val/step-000000251658240.npz index 6ac62984d4e72349421778d1c46402d6d8bb57e6..4ad382c7d28d51266211b155ebc2953351b844d7 100644 --- a/metrics/npz/val/step-000000251658240.npz +++ b/metrics/npz/val/step-000000251658240.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f69f3a0b936ec88bc6005a43574a60bc91a8d147e74b1db3e8694db8ea5940ad +oid sha256:cb9faffd72382fa63f6da5f9ecbab8d65aba813c7bb323e9b6dae78b0627f8ca size 21142 diff --git a/metrics/npz/val/step-000000293601280.npz b/metrics/npz/val/step-000000293601280.npz index f218a3be3891e84a5fcffe5d2265638f0565bb70..7ef940b7906ae9324af134d416bba226ceff282b 100644 --- a/metrics/npz/val/step-000000293601280.npz +++ b/metrics/npz/val/step-000000293601280.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71663e255c36887b4ddc18ece04f79564f440a87a64d87986934a0312b4d4f21 +oid sha256:0dca3d3ff0bc4173d37c70d15d7832157eda1299d9e9513c6aadf6f2083e86b4 size 21142 diff --git a/metrics/npz/val/step-000000335544320.npz b/metrics/npz/val/step-000000335544320.npz index 8b86e3ee24322572dabb8d6a684f572f129d8368..fd9555befba73b08ddbf56427e24de8ebf1b8b62 100644 --- a/metrics/npz/val/step-000000335544320.npz +++ b/metrics/npz/val/step-000000335544320.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:583ea9cf5717a91776de447a997a9602c8e124e32eaa39b81bc8770c3585d73b +oid sha256:1a6dc5f351ea6c0c2b785ef1133891cf55c9916bdad5ee9f6050bcaae818dfeb size 21142 diff --git a/metrics/npz/val/step-000000377487360.npz b/metrics/npz/val/step-000000377487360.npz index ce6f71ce7d571503d04b6ecb434135ec5ff69ccd..0971382766beb61de531a356f0a673df87250733 100644 --- a/metrics/npz/val/step-000000377487360.npz +++ b/metrics/npz/val/step-000000377487360.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17b4c8432b92d6f010fa3f29e1682b2b024a8c3006e42ad0e4f31a4b438b30db +oid sha256:80ade23d0ab226d2614f79d41c922defbdeb79a6f5acfe1a0dc05fb43c870080 size 21142 diff --git a/metrics/npz/val/step-000000419430400.npz b/metrics/npz/val/step-000000419430400.npz index c5d328c3cba70e6ba96c43bf065fbaac468a6c18..7ca40cb6e537a506fea2a544ee080214f9a3d21c 100644 --- a/metrics/npz/val/step-000000419430400.npz +++ b/metrics/npz/val/step-000000419430400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e7adae8c8b3080833220d7ee882f2b757c56baf74c4ca9fa2a35b95fac4315a +oid sha256:97c2596c96210e2712ccfdbb24164f6770b2f8fb0dc626a9b6cee9c365128123 size 21142 diff --git a/metrics/npz/val/step-000000461373440.npz b/metrics/npz/val/step-000000461373440.npz index e0f84adf091c3bdbede63d4a6610ccb8d1cf3517..28ef28de605911c4435f507b7046348f51a0d6bd 100644 --- a/metrics/npz/val/step-000000461373440.npz +++ b/metrics/npz/val/step-000000461373440.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c9923a7f84bfbe4751c87849440ce1b844c84f2482cdc0b2cb9c1c43828903e +oid sha256:f6277c7139d3759cebb5da02c17c213ab3e4f2c887282efa2288af945ea8ef32 size 21142 diff --git a/metrics/npz/val/step-000000503316480.npz b/metrics/npz/val/step-000000503316480.npz index 80c6c5f382e7f0168761da17f12e6be73ec828c2..c07c975db0298583c72680351a004efdfd5f9e33 100644 --- a/metrics/npz/val/step-000000503316480.npz +++ b/metrics/npz/val/step-000000503316480.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0a71a11a871e7e11e59d5e2d4a3290bc20c839173c4bb53d02d5bb0b3a007c7 +oid sha256:9fdb197bff11310adfb3639299c84d085bc32638ccd6672008cc8302ccb47195 size 21142 diff --git a/metrics/npz/val/step-000000545259520.npz b/metrics/npz/val/step-000000545259520.npz index 3a2f508f7685416cc2535aaee42860129e1a959a..291a8791f5efba477bc19bcbf9e0f8d913a5b0ab 100644 --- a/metrics/npz/val/step-000000545259520.npz +++ b/metrics/npz/val/step-000000545259520.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d37be56f811d3a807d84f3842d45c2fd1d9b6ca34d707593302d8712ed2c258 +oid sha256:22a003810f69405729520bb8a263994ed389e6c8c613cc97fd326067130b16fc size 21142 diff --git a/metrics/npz/val/step-000000587202560.npz b/metrics/npz/val/step-000000587202560.npz index 783394cbdb1f68e1bcad6949a8b30107966eca9c..9da3de58f2ec25a02f30f609f36a9c382c0706c7 100644 --- a/metrics/npz/val/step-000000587202560.npz +++ b/metrics/npz/val/step-000000587202560.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efa731ed866f7a692cc2135c56d6581695ae0317f2260695e2eca31770c100e0 +oid sha256:73340fd8a624f2c43731a6613020584c9fc332ae4b24dcbd039350c33bec60b6 size 21142 diff --git a/metrics/npz/val/step-000000629145600.npz b/metrics/npz/val/step-000000629145600.npz index 9519e0541a32d37564057a89332bef0f3751a029..f9e884903efe3cb62128b630f46faaad7eb27d33 100644 --- a/metrics/npz/val/step-000000629145600.npz +++ b/metrics/npz/val/step-000000629145600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa37dbfcd43f3cfd09768878d0110080f79934cd3a9bea5db100494b929024c6 +oid sha256:ee4523f8039fd23d156ec119aab8db1ff40b491e62b3614cbec6be0eca23354e size 21142 diff --git a/metrics/npz/val/step-000000671088640.npz b/metrics/npz/val/step-000000671088640.npz index 22752dbe9325fde6f256bd8c696209a2e7e7faab..52053bba9b1bc895934bcae24632ff060bbdce1b 100644 --- a/metrics/npz/val/step-000000671088640.npz +++ b/metrics/npz/val/step-000000671088640.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6256df857e5e95450ab7896f82a149f5f68a4aff82341916a33211be318dbe3d +oid sha256:ac171ba184ea8ffbf929a3a4bcf0690d19213198c2bc33fb184ef0d3c143dab1 size 21142 diff --git a/metrics/npz/val/step-000000713031680.npz b/metrics/npz/val/step-000000713031680.npz index 01c9bc5366963c07606d6f07280184a3506b9a0d..aaea742481d8d1667f4ad52ea5a02763bd9eb89f 100644 --- a/metrics/npz/val/step-000000713031680.npz +++ b/metrics/npz/val/step-000000713031680.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a894a8cb8303f94f51bd0f125a8ae3cce18121c583b6270a6576c6cbcf742d31 +oid sha256:4156f92845ef78bd18454f2f12dc06cbe9dbcdb2c0000476df35f736eb744321 size 21142 diff --git a/metrics/npz/val/step-000000754974720.npz b/metrics/npz/val/step-000000754974720.npz index e50516cf8dd2750f72268a9cf302fb77ff8daaaa..3b53a06e711915f9b844d3e3ffca2656825bb29d 100644 --- a/metrics/npz/val/step-000000754974720.npz +++ b/metrics/npz/val/step-000000754974720.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37e52f733decc48682e4ba9dbb7c5357a87c2d7959cb71f8148f14610c53e4ce +oid sha256:34d4bc0f958899f45f3e732e963b9113dc92c4828a7074511ac1f7f6a0d2db53 size 21142 diff --git a/metrics/npz/val/step-000000796917760.npz b/metrics/npz/val/step-000000796917760.npz index 2cec19995ab93919ed770e63ca8c41ac10bf19d5..bafc5e4085ec3afe056340c9b9fa8475752f91ce 100644 --- a/metrics/npz/val/step-000000796917760.npz +++ b/metrics/npz/val/step-000000796917760.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc67a29c7b901a43e0413ec6bc70907400db920012fc2a9572fa6d5b0c6092b7 +oid sha256:fcb244fb8026ab2fcaf59693a59746e79f32a64a5c0612e79d2193259428c402 size 21142 diff --git a/metrics/npz/val/step-000000838860800.npz b/metrics/npz/val/step-000000838860800.npz index 50308c14da29bc5f34ac46260c8e5d7233469d27..c427a27f71bc22677c14ed30220aa673531a828b 100644 --- a/metrics/npz/val/step-000000838860800.npz +++ b/metrics/npz/val/step-000000838860800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ee97f16254e69d1033751b2efbe8917ea8732bc4d45ecc616b74419830ba5dc +oid sha256:027ffe5cd5aa1a7a9205e2c5a59554f79ab8d20dfc7c5ed1ea3275a20f467776 size 21142 diff --git a/metrics/npz/val/step-000000880803840.npz b/metrics/npz/val/step-000000880803840.npz index e4878e4f56ad9602ca92b9770b15633ad82a1068..98d0b9116d65fbb30822bcfc8112462137a3ee56 100644 --- a/metrics/npz/val/step-000000880803840.npz +++ b/metrics/npz/val/step-000000880803840.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b4122d5c163fd41d0c029233304b249f7b00eae4931ba08c97d4de052dbf9f4 +oid sha256:d0dbab064f3c2ed0cc73fde71af1ceb9ef358ed809a73231bcab616a679bd477 size 21142 diff --git a/metrics/npz/val/step-000000922746880.npz b/metrics/npz/val/step-000000922746880.npz index 204888505c432cf0d6e90ef1022df59cb83efb87..97d1fc087e9fdad7130dd79dcfb95fbc297d7019 100644 --- a/metrics/npz/val/step-000000922746880.npz +++ b/metrics/npz/val/step-000000922746880.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ca0c2ff899c5fbc62232fdb770b69342c9f41cc022f623a1151997706415cf0 +oid sha256:269279c4860dbb7aa4668f5d62f7d135518edcf3f5ebf0a82e4fda5839451c9e size 21142 diff --git a/metrics/npz/val/step-000000964689920.npz b/metrics/npz/val/step-000000964689920.npz index d49871706730dd2457a087174df880f7ba2b6e38..e48b0cd29aad17236ca4d64e8fa2406487b38a5d 100644 --- a/metrics/npz/val/step-000000964689920.npz +++ b/metrics/npz/val/step-000000964689920.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:537e932d42175a1e4b6d2791457b0aa068802366f6ebae71205efb5995f64ab3 +oid sha256:88dca8813ef834c5aa0aa9c1e2307e530a4f32c8f22503ee5bad6c64472f7c64 size 21142 diff --git a/metrics/npz/val/step-000001006632960.npz b/metrics/npz/val/step-000001006632960.npz index 532c888e7d6f79a2a4d2524a6e78c9c98c3c81e1..a7fa4969dcf5ab3021fdc45235c4e37c1f4a3d47 100644 --- a/metrics/npz/val/step-000001006632960.npz +++ b/metrics/npz/val/step-000001006632960.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5b1f740abde5dc95f4dbf2b180dd3c6e3db70639b83d4ae4a61b96f1bdb559e +oid sha256:c8063b0c2d3bfcfd67cc10721df50338da3aca26be63191d3b60d1b357507d1e size 21142 diff --git a/metrics/npz/val/step-000001048576000.npz b/metrics/npz/val/step-000001048576000.npz index 61d8e1e8bab2120f10946cb218a45136b4d598f4..ecf89e05f06ebbb538c2826f137098772a3db850 100644 --- a/metrics/npz/val/step-000001048576000.npz +++ b/metrics/npz/val/step-000001048576000.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99461c8c138c3eafb136dafb66fbe2ddaa35fa7b517f773502b2e637258a56d4 +oid sha256:f5409602f30bca23fc109b3d165ff29d3ad68d5756152047e816783f0ef9aa10 size 21142 diff --git a/metrics/npz/val/step-000001090519040.npz b/metrics/npz/val/step-000001090519040.npz index b2f92e5ce065c0c7f901d60dfd6e45c3ec84c756..f70df9b5cbc484e5fc4259bc4c39b12a526dd153 100644 --- a/metrics/npz/val/step-000001090519040.npz +++ b/metrics/npz/val/step-000001090519040.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2abe4163fa5530c2ca5e6902dd532976dd59408db5d5b64907e55bc8072a7df +oid sha256:fb0d18d49ff97bc62c1e87abc1cc14a94bc1425514c2067176acde416e595c8b size 21142 diff --git a/metrics/npz/val/step-000001132462080.npz b/metrics/npz/val/step-000001132462080.npz index 237419738a678632547a8a5b1043e918133844ba..154108d1fe097e0c26c9cc43452af59531d9f585 100644 --- a/metrics/npz/val/step-000001132462080.npz +++ b/metrics/npz/val/step-000001132462080.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ebc87a5428c4cd4b03f09730824877b59ecbdd68df3665a69a0a64858e9b833 +oid sha256:057e1cf056623b634752f128a2330758d1f1467e4c3d90b4ba69c570fbe40db2 size 21142 diff --git a/metrics/npz/val/step-000001174405120.npz b/metrics/npz/val/step-000001174405120.npz index ae55985af02b23e94ed25e5b5241342a1e3999e8..2b69a86d3639f2afed93392eafec6a9884698fe9 100644 --- a/metrics/npz/val/step-000001174405120.npz +++ b/metrics/npz/val/step-000001174405120.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:572b4f74ee879984ae0620f4b59f5478f99bc5281013e087c1adb08f42e2ce7b +oid sha256:c236f2a524cf7bbf10044a5c58eff266fc7348f8bc7dca8c6d706cae34a76a65 size 21142 diff --git a/metrics/npz/val/step-000001216348160.npz b/metrics/npz/val/step-000001216348160.npz index c742a3a63bba1794eb5b29901d2e97aa4479a733..fee271762fba40c23cdbf833505427bcfb1f8a43 100644 --- a/metrics/npz/val/step-000001216348160.npz +++ b/metrics/npz/val/step-000001216348160.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bd997bf7ac6b9f1bad61378622d953abe55f64b4bfa10f3148fd879d0ee9824 +oid sha256:f41d9cf0d4c3b8577bf9aa7f2cb7b6ab3627f6afc2d5a49229c33a595756fe27 size 21142 diff --git a/metrics/npz/val/step-000001258291200.npz b/metrics/npz/val/step-000001258291200.npz index 051ea6585d79bbb9454c024fd7c9d97033c48387..4eef2156a3fac6d72c196938507a44daa1b43386 100644 --- a/metrics/npz/val/step-000001258291200.npz +++ b/metrics/npz/val/step-000001258291200.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50893d24549f21138942ca3c6d75627494a276629b8efc396308b8b28e47df21 +oid sha256:c218089424321d3f7e4e8df401e831290888c73e576c70cef6628d2f082e4c7e size 21142 diff --git a/metrics/npz/val/step-000001300234240.npz b/metrics/npz/val/step-000001300234240.npz index 4207315efa8095540a444dc99f67fb4d1827ed83..ff7528a605067f0f353d37bebc5cde95c45ead98 100644 --- a/metrics/npz/val/step-000001300234240.npz +++ b/metrics/npz/val/step-000001300234240.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55b4179e2ab9c7e174119e4ab49669bc68cfde0adf6830dcea4c4104b812cf4f +oid sha256:b0dc90714bc974ab1a485f9b756a8e0a8f1d1065d8295f0fc429c6ce5fd3db5a size 21142 diff --git a/metrics/npz/val/step-000001342177280.npz b/metrics/npz/val/step-000001342177280.npz index f9ddeb82ab83908a2cddc73a859c5944107d68ca..db0cc04e30c1bffc6bcf99c7efa2dc6c9bb71c5e 100644 --- a/metrics/npz/val/step-000001342177280.npz +++ b/metrics/npz/val/step-000001342177280.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:660d569b87d4206dae695aa5d22cd0e0bef4b9a838c74559eead93afc9b2f4ce +oid sha256:42d551acc2ce288fdd1f915cf87ec9441bf5c2c9c28db5d7cc10c8e3413fffcc size 21142 diff --git a/metrics/npz/val/step-000001384120320.npz b/metrics/npz/val/step-000001384120320.npz index 35a4b9173653f755ddca86887cab90ef1548724d..1134428482ff8c2af0083de96d28017a22071585 100644 --- a/metrics/npz/val/step-000001384120320.npz +++ b/metrics/npz/val/step-000001384120320.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e75ddca22c1b5fc75bec9bd81300e6e3aa01b83ceca7d20383b9173c43ff6651 +oid sha256:b25a6747fec7dc0c4f963c72de994d4f7761a9c33a280b767cb86a63fad9c2c1 size 21142 diff --git a/metrics/npz/val/step-000001426063360.npz b/metrics/npz/val/step-000001426063360.npz index 277d66921e6fb73850271a00b006190392f6c573..4c212b313400fafca7780e2145b97cc8b070ef3d 100644 --- a/metrics/npz/val/step-000001426063360.npz +++ b/metrics/npz/val/step-000001426063360.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:421c5157953aaebdcceeadb38b40786425c6f54430bbd631bc82d435802d4e4c +oid sha256:9f32daf3e1038bc48eb7b7553a8cc033027c535b260e38896d50bfa984c32555 size 21142 diff --git a/metrics/npz/val/step-000001468006400.npz b/metrics/npz/val/step-000001468006400.npz index fe1979383832181078403b903f07d7a49e3c12d0..37f5127c40059bba974b5edc3fe7dfd0fad1a7f0 100644 --- a/metrics/npz/val/step-000001468006400.npz +++ b/metrics/npz/val/step-000001468006400.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e09a19f5548db951758c282826f996ea740d907d6c5720ebaaf7ff8ee929c6c2 +oid sha256:5d4cbfc34600232ae9a0704c6ef2b882cf58b3bbc851d89036a67a64a5daa0ef size 21142 diff --git a/metrics/npz/val/step-000001509949440.npz b/metrics/npz/val/step-000001509949440.npz index cc526b76fab5a1403ba8b5f8ad5224122702cdd0..8fe2b1fb0be0ac57c9ad0700cb0cb1297f381130 100644 --- a/metrics/npz/val/step-000001509949440.npz +++ b/metrics/npz/val/step-000001509949440.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db42825e304be5bdf3cccb781cf115bac48d58c458b2ba48d290beba8ff74bed +oid sha256:6974e23122e3bf87c247d3726d1f8ce307bc0e88e71901e5d30d6dcaa3c0e236 size 21142 diff --git a/metrics/npz/val/step-000001551892480.npz b/metrics/npz/val/step-000001551892480.npz index fdb0d31be4c579d8398f7c9a35842212e0ee12a2..82e3953cdfbdb5261cffb8e6168cdf7866133d80 100644 --- a/metrics/npz/val/step-000001551892480.npz +++ b/metrics/npz/val/step-000001551892480.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2baef59cebbfaf5bb32066ccf6a2d00891321c36f4d11428b5a3c2bcac922b42 +oid sha256:039944c2b0e65e4462e9f9eebe1abe4c371b8b67295c5c84060b510ac6ee9ae7 size 21142 diff --git a/metrics/npz/val/step-000001593835520.npz b/metrics/npz/val/step-000001593835520.npz index d0c0abd125468fc825e5f463d89b712d505ac670..8bcf0b9f343c4306e04f2a5c60bdd51efc7fb702 100644 --- a/metrics/npz/val/step-000001593835520.npz +++ b/metrics/npz/val/step-000001593835520.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:597d5da02cc3f7f06648c1ecfb27ecd346b74079f69f0df5ad3cb43a0e404f23 +oid sha256:f53dcbcc12582b9166a800d718b449daf286f1eb2bfa9d0748534b76321320af size 21142 diff --git a/metrics/npz/val/step-000001635778560.npz b/metrics/npz/val/step-000001635778560.npz index 16622da6f13636235ae1a518b508415a0a901b88..7650d47c448f6c55fd972fe044b9deb3e3ab5a85 100644 --- a/metrics/npz/val/step-000001635778560.npz +++ b/metrics/npz/val/step-000001635778560.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:306f9817a86ef8e7cb953127973a6efbfc0b0157f4e954a526deb9458fd63231 +oid sha256:01d9585a865a7cc9ed0506a33a71bfa65e48d5820a0ac9e4194a9dc7e93f55fc size 21142 diff --git a/metrics/npz/val/step-000001677721600.npz b/metrics/npz/val/step-000001677721600.npz index 3cc6b6ff6836ff08517f7dac833cd9d1ad39c721..7e060cbe4ecfbbd5bd7f8d9ff344a2f4f08f47c0 100644 --- a/metrics/npz/val/step-000001677721600.npz +++ b/metrics/npz/val/step-000001677721600.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78f33b506d9071b745d475ba66c6ddaaf2cc7874262b1cd0b7bd240eba454e36 +oid sha256:7ce3531fa9e8c02c787fe7e7a0cee36fcd84c7e7a4fe2aff556cab0a5de5c55b size 21142 diff --git a/metrics/npz/val/step-000001719664640.npz b/metrics/npz/val/step-000001719664640.npz index 72753410e16c74228e44ea53d63b7939855b5745..6b6b95cf4590abf073cd8f53488fea45f98dc71c 100644 --- a/metrics/npz/val/step-000001719664640.npz +++ b/metrics/npz/val/step-000001719664640.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f02138b6583f5ce43b042db7c2b687585f13f61fc5cf3951990688f7161b078d +oid sha256:97c7607d6b41a28f26a5f1d6bf82d6a25a22e8e75b21caa917c26006e293ca26 size 21142 diff --git a/metrics/npz/val/step-000001761607680.npz b/metrics/npz/val/step-000001761607680.npz index 4c7443deb76240d569b94fad8c2a3344e1653186..39dfe17df359c33b43b96b578d59d3f46b72f1f9 100644 --- a/metrics/npz/val/step-000001761607680.npz +++ b/metrics/npz/val/step-000001761607680.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c69e733928b15ba6001db3f5ee0f36c3489292ebd43a4eee958fbd50f9264ae7 +oid sha256:332746921c77da9207084daa58dd9b224ca4483eb11702735ed973d900edfdb7 size 21142 diff --git a/metrics/npz/val/step-000001803550720.npz b/metrics/npz/val/step-000001803550720.npz index 129f21d17c46546a7cf2bd9cf69ad4a89d3a6a8e..df976f4868de2c89a1e5dcc8786519c9869df9f5 100644 --- a/metrics/npz/val/step-000001803550720.npz +++ b/metrics/npz/val/step-000001803550720.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb8fd3fc60b8d011670cf05f6df10b483fce7b8e4610f048226c67db5bf2c31b +oid sha256:59193d540b0ea8fc308397f411b418d5b349d9ef0ef1a4100bf605aa56a9f7b0 size 21142 diff --git a/metrics/npz/val/step-000001845493760.npz b/metrics/npz/val/step-000001845493760.npz index b018dfbab672094ef1d4ef836f9669f25ed39670..885519f53f52b25de306c4b8b49937aa65b016ab 100644 --- a/metrics/npz/val/step-000001845493760.npz +++ b/metrics/npz/val/step-000001845493760.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e12b688c621ca45cd4d5a73566d9f2334e8b82f0f5233ea0172c21e302d76a55 +oid sha256:892ca9df8e0b68ae97adaa9efee518c52641d8f11914fd16b75bbce6fa2c3f0d size 21142 diff --git a/metrics/npz/val/step-000001887436800.npz b/metrics/npz/val/step-000001887436800.npz index 6d8e947df939ec1b9c8e805295e01b6d0ea1e156..e811d46e594129e7e2c688ac77e987966b4d621d 100644 --- a/metrics/npz/val/step-000001887436800.npz +++ b/metrics/npz/val/step-000001887436800.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5c7eee2b106073279232f7787e689f8de15abd7a0d1bf3de99148803ee7843e +oid sha256:bf5f7ff5248e4351941ad3aec4a256191530a9565f6d20a18c8fe93f5b8a49e0 size 21142 diff --git a/metrics/npz/val/step-000001929379840.npz b/metrics/npz/val/step-000001929379840.npz index 1414ce2d684643ee2f9cc1e421f6de2f1d9da3b0..da145af47bc5e9ce2b3f7767c5e05d9ce12449aa 100644 --- a/metrics/npz/val/step-000001929379840.npz +++ b/metrics/npz/val/step-000001929379840.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a2ed1271198b8032720f1bc9e1c15475a848a08d80d6e7c4adac4a28c1233b2 +oid sha256:90f227e48efe7f2c01743a1e0bf5d2f319bc181c2e0166b47332953ea28a7825 size 21142 diff --git a/metrics/npz/val/step-000001971322880.npz b/metrics/npz/val/step-000001971322880.npz index f2957ee42fd7fe2e7e8917d60dfc0f489c7c7c94..e5d8b8fcf83d5317afd2e678307036f8dd8396ff 100644 --- a/metrics/npz/val/step-000001971322880.npz +++ b/metrics/npz/val/step-000001971322880.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc057e8a3cde351679f352b6c3b5ca2198dd70fac5b5e7734a3387913f00c6e2 +oid sha256:e3d578a90588624ca0366553dcb59b797bdc02c8b1f8cc2c473cc6ab11b44075 size 21142 diff --git a/metrics/npz/val/step-000002013265920.npz b/metrics/npz/val/step-000002013265920.npz index 20b4bc89ea4ca8e1a361f11511bbbae9fabef71d..946e4237ce679ea76bf85d417428a3109396193b 100644 --- a/metrics/npz/val/step-000002013265920.npz +++ b/metrics/npz/val/step-000002013265920.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88501a8da55715356174a5b97788ddfc2d81b4c09883f174000e6ef2b66ce879 +oid sha256:00cd035e084cd894200b2d07f53bb3815933ffd097701e2f8650ff4b1ea69567 size 21142 diff --git a/metrics/npz/val/step-000002055208960.npz b/metrics/npz/val/step-000002055208960.npz index 3d6564e4d715d68a4ef86c315908adfd811a626b..3b0c0c550412aa320d54bd5e8bc84861990797c3 100644 --- a/metrics/npz/val/step-000002055208960.npz +++ b/metrics/npz/val/step-000002055208960.npz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b3ea106bbb108da2ecb210a49a9de8993094893278f611b2a92d8f548c240e4 +oid sha256:b527ff1396eff00600b35df13247a545e4e8f6c8465a1341fc7d8473c6943de3 size 21142 diff --git a/metrics/wandb/wandb_run_id.txt b/metrics/wandb/wandb_run_id.txt index 337b6a3fe8d624a15f4331d460e2fc6915174dcf..8b9ea5cab9cbd5cd077d5089871f2d8093141791 100644 --- a/metrics/wandb/wandb_run_id.txt +++ b/metrics/wandb/wandb_run_id.txt @@ -1 +1 @@ -e8tgj17j \ No newline at end of file +gwmp4t3h \ No newline at end of file diff --git a/model.txt b/model.txt index d5a7180d0a8413695e4ff6cade7069e0e5926476..50eb592ac1cdfeedd30e8d6f8a73d6d1bddeefe0 100644 --- a/model.txt +++ b/model.txt @@ -1,24 +1,21 @@ AlibiForCausalLM( - (model): AlibiModel( - (embeddings): Embedding(50277, 256) - (layers): ModuleList( - (0-1): 2 x TransformerBlock( - (attn_norm): RMSNorm(256, eps=1e-06) - (attn): Attention( - (q_proj): Linear(in_features=256, out_features=256, bias=False) - (k_proj): Linear(in_features=256, out_features=256, bias=False) - (v_proj): Linear(in_features=256, out_features=256, bias=False) - (o_proj): Linear(in_features=256, out_features=256, bias=False) - ) - (mlp_norm): RMSNorm(256, eps=1e-06) - (mlp): TransformerMLP( - (gate_proj): Linear(in_features=256, out_features=1536, bias=False) - (down_proj): Linear(in_features=768, out_features=256, bias=False) - (act_fn): SiLU() - ) + (emb): Embedding(50277, 256) + (layers): ModuleList( + (0-1): 2 x TransformerBlock( + (attn_norm): RMSNorm(256, eps=1e-06) + (attn): Attention( + (q_proj): Linear(in_features=256, out_features=256, bias=False) + (k_proj): Linear(in_features=256, out_features=256, bias=False) + (v_proj): Linear(in_features=256, out_features=256, bias=False) + (o_proj): Linear(in_features=256, out_features=256, bias=False) + ) + (mlp_norm): RMSNorm(256, eps=1e-06) + (mlp): TransformerMLP( + (gate_proj): Linear(in_features=256, out_features=1536, bias=False) + (down_proj): Linear(in_features=768, out_features=256, bias=False) ) ) - (norm): RMSNorm(256, eps=1e-06) ) + (norm): RMSNorm(256, eps=1e-06) (lm_head): Linear(in_features=256, out_features=50277, bias=False) ) diff --git a/no_decay_params.txt b/no_decay_params.txt index 7acf4201eb540c492bd36f923aa1466d9d07cb09..83056d891dbdc5a102994f71ed0098a627804cfc 100644 --- a/no_decay_params.txt +++ b/no_decay_params.txt @@ -1,5 +1,5 @@ -_forward_module._fsdp_wrapped_module.model.layers.0.attn_norm.weight -_forward_module._fsdp_wrapped_module.model.layers.0.mlp_norm.weight -_forward_module._fsdp_wrapped_module.model.layers.1.attn_norm.weight -_forward_module._fsdp_wrapped_module.model.layers.1.mlp_norm.weight -_forward_module._fsdp_wrapped_module.model.norm.weight +_forward_module._fsdp_wrapped_module.layers.0.attn_norm.weight +_forward_module._fsdp_wrapped_module.layers.0.mlp_norm.weight +_forward_module._fsdp_wrapped_module.layers.1.attn_norm.weight +_forward_module._fsdp_wrapped_module.layers.1.mlp_norm.weight +_forward_module._fsdp_wrapped_module.norm.weight