diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.model.pth deleted file mode 100644 index e852871fcbc031423feef4f4b5b5650cfcd2a366..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:845d95e7a891af4aa67d67010d0345372ba47972219b6584a84276a2d24f7b4c -size 297203748 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index cc5df57257948a2767ae863ad1bf94360a8701de..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bdfa2eb02a223643fd49768731adaf8b51dc7ae98c0a461826366103f80c8bd5 -size 652420087 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.other.pth deleted file mode 100644 index cb618fd2b256428f0c67a5c1d6c6b868b047e8a0..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c5889268866aaf511f8591d23816d7faa96bed89b07daff596fe31445d6abc6 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch0/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.model.pth deleted file mode 100644 index 4463edcde807cbd2d66b1213e642cdb8b2640c22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b4dde1230986c4e5b21b516ceed0f0cf400e13c18d318e74a21cd0e791b0de0 -size 297203748 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index f47a343b9d45db77dfbdfe7e306f895c43a552fa..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61f0a81d25eacf47d9d027a432e926afe87659daf833caaf6d7942497739b40e -size 652420087 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.other.pth deleted file mode 100644 index dd02f4389f04e675f3099c469df456d56089cd48..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:96368f4fb4ce9d163fe1194a6040a4cdcf0e6cddb3d702548f866177a4ce8d80 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch1/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.model.pth deleted file mode 100644 index 88d2ebd8a8a2bff01be665aa462b2c73c92f9645..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ff2e36a1e758f232f7ad1986a7524026242d1aab6a62cea8587e04b70019032 -size 297203748 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index c4c942c5ee3f52aff040847a460a5ba07349c40e..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b46b67179745b5e1b07f2b8b7601c259152124a2ea43ce07085fd617858c399 -size 652420087 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.other.pth deleted file mode 100644 index 474c44e5c7323224da2ba5781ff483015e0a62e7..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:307dbfce529fa5fb62614ed0ef32f8a5808cf46d71b5204dd27bb6d5866f4280 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch2/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.model.pth deleted file mode 100644 index d7909a0119bd70774cf2ce3e76131c85511829fe..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d35ec2de84035df505ee9afdfda40ee31647126e02c67504a29af31edb2f3f1 -size 297203748 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 9eadcfb391712aa0bcfa7243d2d77663e6012a6c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b63a86201d584128bf1d9add0a1c18fb2b8470ec81839c424b98d7009b70a948 -size 652420087 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.other.pth deleted file mode 100644 index 42030546a60a52761cc8a5b9ff9915c39c594e60..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7c4e36a1edeada21386a34ca7b8f14e4c60081320427fd35a68427cd71307f1 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/epoch3/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/log.txt b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/log.txt deleted file mode 100644 index 33ab444b7249f1f75e28c0dca912c7da6a1af2a8..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/log.txt +++ /dev/null @@ -1,4 +0,0 @@ -{"train_lr": 2.49692118226601e-05, "train_closs": 0.8056433233203082, "train_grad_norm": 0.5908145041668357, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_closs": 0.8056433233203082, "val_grad_norm": 0.5908145041668357} -{"train_lr": 4.611427498140404e-05, "train_closs": 0.7642887281919164, "train_grad_norm": 0.44820075342661053, "epoch": 1, "val_lr": 4.611427498140404e-05, "val_closs": 0.7642887281919164, "val_grad_norm": 0.44820075342661053} -{"train_lr": 2.7513854679802933e-05, "train_closs": 0.750088836140403, "train_grad_norm": 0.4348251323027564, "epoch": 2, "val_lr": 2.7513854679802933e-05, "val_closs": 0.750088836140403, "val_grad_norm": 0.4348251323027564} -{"train_lr": 8.899579698398978e-06, "train_closs": 0.7437510005037272, "train_grad_norm": 0.4512289605410815, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_closs": 0.7437510005037272, "val_grad_norm": 0.4512289605410815} diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/output.log b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/output.log deleted file mode 100644 index d559bfc8035e79f24712ad17b31daf674c71e6b6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B/output.log +++ /dev/null @@ -1,4306 +0,0 @@ -WARNING:torch.distributed.run: -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -| distributed init (rank 5): env://, gpu 5 -| distributed init (rank 2): env://, gpu 2 -| distributed init (rank 1): env://, gpu 1 -| distributed init (rank 0): env://, gpu 0 -| distributed init (rank 4): env://, gpu 4 -| distributed init (rank 7): env://, gpu 7 -| distributed init (rank 3): env://, gpu 3 -| distributed init (rank 6): env://, gpu 6 -[19:35:45.655312] > initializing model parallel with size 1 -[19:35:45.655420] > initializing ddp with size 8 -[19:35:45.655428] > initializing pipeline with size 1 -[19:35:45.813631] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory -[19:35:45.813780] Namespace(batch_size=4, -accum_iter=2, -llama_type='llama_peft', -llama_config=['../checkpoints/llama2/Llama-2-70b/params.json', -'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], -no_visual=True, -tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model', -pretrained_path='../checkpoints/llama2/Llama-2-70b/', -pretrained_type='meta_ori', -weight_decay=0.02, -lr=5e-05, -min_lr=5e-06, -epochs=4, -warmup_epochs=1.0, -clip_grad=2, -max_words=512, -dialog=False, -data_config='configs/data/finetune/sg/alpaca.yaml', -output_dir='output/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_70B', -log_dir='./output_dir', -save_interval=1, -device='cuda', -seed=0, -resume='', -num_workers=8, -pin_mem=True, -world_size=8, -local_rank=-1, -dist_on_itp=False, -dist_url='env://', -model_parallel_size=1, -data_parallel='sdp', -precision='bf16', -checkpointing=True, -quant=True, -rank=0, -gpu=0, -distributed=True, -dist_backend='nccl') -[19:35:45.815488] Start initialization. -[19:35:45.815557] ## Processing on RANK 0. -[19:35:45.827930] Model Args: - ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) -[19:44:02.506035] Model is Peft: True -[19:44:02.522665] Trainable parameter count : 215130112 (local rank), 215130112 (all). -[19:44:02.880306] ## Load pretrained from ../checkpoints/llama2/Llama-2-70b/ -[19:46:53.015413] ## Quantizing model to 4bit! - -Qunatization Process: 0%| | 0/2087 [00:00 -[22:25:50.178814] Start training for 4 epochs -[22:25:50.199290] log_dir: ./output_dir -[22:26:01.325300] Epoch: [0] [0/1624] lr: 0.000000 closs: 0.8540 (0.8540) time: 11.1254 data: 2.5971 max mem: 55406 -[22:27:01.940245] Epoch: [0] [10/1624] lr: 0.000000 closs: 1.0012 (1.0038) grad_norm: 1.2063 (1.2208) time: 6.5217 data: 0.2363 max mem: 71823 -[22:28:02.438780] Epoch: [0] [20/1624] lr: 0.000001 closs: 1.0097 (1.0247) grad_norm: 1.2104 (1.2608) time: 6.0555 data: 0.0002 max mem: 71823 -[22:29:02.950221] Epoch: [0] [30/1624] lr: 0.000001 closs: 0.9723 (1.0132) grad_norm: 1.2321 (1.2460) time: 6.0504 data: 0.0002 max mem: 71823 -[22:30:03.364521] Epoch: [0] [40/1624] lr: 0.000001 closs: 0.9371 (1.0006) grad_norm: 1.2269 (1.2441) time: 6.0462 data: 0.0002 max mem: 71823 -[22:31:03.870323] Epoch: [0] [50/1624] lr: 0.000002 closs: 0.9668 (0.9965) grad_norm: 1.2104 (1.2135) time: 6.0459 data: 0.0002 max mem: 71823 -[22:32:04.351639] Epoch: [0] [60/1624] lr: 0.000002 closs: 0.9717 (0.9953) grad_norm: 1.2251 (1.2350) time: 6.0492 data: 0.0002 max mem: 71823 -[22:33:04.758111] Epoch: [0] [70/1624] lr: 0.000002 closs: 0.9614 (0.9916) grad_norm: 1.2269 (1.2417) time: 6.0443 data: 0.0002 max mem: 71823 -[22:34:05.226985] Epoch: [0] [80/1624] lr: 0.000002 closs: 0.9478 (0.9895) grad_norm: 1.2033 (1.2314) time: 6.0436 data: 0.0002 max mem: 71823 -[22:35:05.576989] Epoch: [0] [90/1624] lr: 0.000003 closs: 0.9776 (0.9984) grad_norm: 1.2512 (1.2387) time: 6.0408 data: 0.0002 max mem: 71823 -[22:36:06.189025] Epoch: [0] [100/1624] lr: 0.000003 closs: 0.9776 (0.9902) grad_norm: 1.2033 (1.2215) time: 6.0480 data: 0.0002 max mem: 71823 -[22:37:06.576816] Epoch: [0] [110/1624] lr: 0.000003 closs: 0.8661 (0.9863) grad_norm: 1.1610 (1.2106) time: 6.0499 data: 0.0002 max mem: 71823 -[22:38:07.110900] Epoch: [0] [120/1624] lr: 0.000004 closs: 0.8741 (0.9759) grad_norm: 1.0719 (1.1981) time: 6.0460 data: 0.0002 max mem: 71823 -[22:39:07.486418] Epoch: [0] [130/1624] lr: 0.000004 closs: 0.8823 (0.9698) grad_norm: 0.9958 (1.1902) time: 6.0454 data: 0.0002 max mem: 71823 -[22:40:08.130888] Epoch: [0] [140/1624] lr: 0.000004 closs: 0.9112 (0.9670) grad_norm: 0.9199 (1.1644) time: 6.0509 data: 0.0002 max mem: 71823 -[22:41:08.533382] Epoch: [0] [150/1624] lr: 0.000005 closs: 0.9242 (0.9628) grad_norm: 0.8800 (1.1345) time: 6.0522 data: 0.0002 max mem: 71823 -[22:42:09.034717] Epoch: [0] [160/1624] lr: 0.000005 closs: 0.8912 (0.9579) grad_norm: 0.7506 (1.1147) time: 6.0451 data: 0.0002 max mem: 71823 -[22:43:09.518071] Epoch: [0] [170/1624] lr: 0.000005 closs: 0.8735 (0.9534) grad_norm: 0.7081 (1.0861) time: 6.0491 data: 0.0002 max mem: 71823 -[22:44:10.028284] Epoch: [0] [180/1624] lr: 0.000006 closs: 0.8735 (0.9524) grad_norm: 0.6740 (1.0665) time: 6.0495 data: 0.0002 max mem: 71823 -[22:45:10.415137] Epoch: [0] [190/1624] lr: 0.000006 closs: 0.8920 (0.9477) grad_norm: 0.6685 (1.0480) time: 6.0447 data: 0.0002 max mem: 71823 -[22:46:10.980753] Epoch: [0] [200/1624] lr: 0.000006 closs: 0.8385 (0.9430) grad_norm: 0.6200 (1.0273) time: 6.0475 data: 0.0002 max mem: 71823 -[22:47:11.475388] Epoch: [0] [210/1624] lr: 0.000006 closs: 0.8197 (0.9336) grad_norm: 0.6226 (1.0074) time: 6.0529 data: 0.0002 max mem: 71823 -[22:48:12.204068] Epoch: [0] [220/1624] lr: 0.000007 closs: 0.8142 (0.9318) grad_norm: 0.6504 (0.9950) time: 6.0610 data: 0.0002 max mem: 71823 -[22:49:12.675260] Epoch: [0] [230/1624] lr: 0.000007 closs: 0.8794 (0.9284) grad_norm: 0.5918 (0.9756) time: 6.0599 data: 0.0002 max mem: 71823 -[22:50:13.099808] Epoch: [0] [240/1624] lr: 0.000007 closs: 0.8639 (0.9274) grad_norm: 0.5790 (0.9596) time: 6.0446 data: 0.0002 max mem: 71823 -[22:51:13.574799] Epoch: [0] [250/1624] lr: 0.000008 closs: 0.8357 (0.9235) grad_norm: 0.5591 (0.9470) time: 6.0448 data: 0.0002 max mem: 71823 -[22:52:14.165879] Epoch: [0] [260/1624] lr: 0.000008 closs: 0.7928 (0.9193) grad_norm: 0.5673 (0.9337) time: 6.0531 data: 0.0004 max mem: 71823 -[22:53:14.572753] Epoch: [0] [270/1624] lr: 0.000008 closs: 0.7678 (0.9129) grad_norm: 0.5673 (0.9215) time: 6.0498 data: 0.0004 max mem: 71823 -[22:54:15.215410] Epoch: [0] [280/1624] lr: 0.000009 closs: 0.7498 (0.9080) grad_norm: 0.5620 (0.9083) time: 6.0523 data: 0.0002 max mem: 71823 -[22:55:15.812359] Epoch: [0] [290/1624] lr: 0.000009 closs: 0.7971 (0.9047) grad_norm: 0.5513 (0.8969) time: 6.0618 data: 0.0002 max mem: 71823 -[22:56:16.387366] Epoch: [0] [300/1624] lr: 0.000009 closs: 0.8420 (0.9030) grad_norm: 0.5513 (0.8880) time: 6.0585 data: 0.0002 max mem: 71823 -[22:57:16.655765] Epoch: [0] [310/1624] lr: 0.000010 closs: 0.8743 (0.9030) grad_norm: 0.5513 (0.8773) time: 6.0420 data: 0.0002 max mem: 71823 -[22:58:17.237196] Epoch: [0] [320/1624] lr: 0.000010 closs: 0.8398 (0.9004) grad_norm: 0.5343 (0.8641) time: 6.0424 data: 0.0002 max mem: 71823 -[22:59:17.737444] Epoch: [0] [330/1624] lr: 0.000010 closs: 0.8053 (0.8976) grad_norm: 0.5343 (0.8554) time: 6.0540 data: 0.0002 max mem: 71823 -[23:00:18.423882] Epoch: [0] [340/1624] lr: 0.000010 closs: 0.8153 (0.8951) grad_norm: 0.5286 (0.8483) time: 6.0592 data: 0.0002 max mem: 71823 -[23:01:18.839000] Epoch: [0] [350/1624] lr: 0.000011 closs: 0.7884 (0.8906) grad_norm: 0.5279 (0.8394) time: 6.0549 data: 0.0002 max mem: 71823 -[23:02:19.460005] Epoch: [0] [360/1624] lr: 0.000011 closs: 0.7884 (0.8875) grad_norm: 0.5553 (0.8307) time: 6.0517 data: 0.0002 max mem: 71823 -[23:03:19.912357] Epoch: [0] [370/1624] lr: 0.000011 closs: 0.7246 (0.8838) grad_norm: 0.5415 (0.8320) time: 6.0535 data: 0.0002 max mem: 71823 -[23:04:20.486087] Epoch: [0] [380/1624] lr: 0.000012 closs: 0.7053 (0.8808) grad_norm: 0.5415 (0.8254) time: 6.0512 data: 0.0002 max mem: 71823 -[23:05:20.870772] Epoch: [0] [390/1624] lr: 0.000012 closs: 0.7642 (0.8791) grad_norm: 0.5406 (0.8183) time: 6.0478 data: 0.0002 max mem: 71823 -[23:06:21.336837] Epoch: [0] [400/1624] lr: 0.000012 closs: 0.8676 (0.8794) grad_norm: 0.5359 (0.8124) time: 6.0424 data: 0.0002 max mem: 71823 -[23:07:21.672481] Epoch: [0] [410/1624] lr: 0.000013 closs: 0.8830 (0.8788) grad_norm: 0.5266 (0.8068) time: 6.0400 data: 0.0002 max mem: 71823 -[23:08:22.247469] Epoch: [0] [420/1624] lr: 0.000013 closs: 0.8277 (0.8764) grad_norm: 0.5204 (0.7992) time: 6.0454 data: 0.0002 max mem: 71823 -[23:09:22.659914] Epoch: [0] [430/1624] lr: 0.000013 closs: 0.7676 (0.8749) grad_norm: 0.5266 (0.7972) time: 6.0492 data: 0.0002 max mem: 71823 -[23:10:23.178064] Epoch: [0] [440/1624] lr: 0.000014 closs: 0.8061 (0.8741) grad_norm: 0.5175 (0.7901) time: 6.0464 data: 0.0002 max mem: 71823 -[23:11:23.623714] Epoch: [0] [450/1624] lr: 0.000014 closs: 0.7633 (0.8713) grad_norm: 0.5049 (0.7839) time: 6.0481 data: 0.0002 max mem: 71823 -[23:12:24.087130] Epoch: [0] [460/1624] lr: 0.000014 closs: 0.7547 (0.8687) grad_norm: 0.5049 (0.7777) time: 6.0453 data: 0.0002 max mem: 71823 -[23:13:24.634326] Epoch: [0] [470/1624] lr: 0.000014 closs: 0.7134 (0.8634) grad_norm: 0.4959 (0.7728) time: 6.0504 data: 0.0002 max mem: 71823 -[23:14:25.055551] Epoch: [0] [480/1624] lr: 0.000015 closs: 0.6520 (0.8602) grad_norm: 0.5022 (0.7679) time: 6.0483 data: 0.0002 max mem: 71823 -[23:15:25.593287] Epoch: [0] [490/1624] lr: 0.000015 closs: 0.7743 (0.8597) grad_norm: 0.4973 (0.7639) time: 6.0478 data: 0.0002 max mem: 71823 -[23:16:26.210980] Epoch: [0] [500/1624] lr: 0.000015 closs: 0.8056 (0.8585) grad_norm: 0.5046 (0.7609) time: 6.0576 data: 0.0002 max mem: 71823 -[23:17:26.665072] Epoch: [0] [510/1624] lr: 0.000016 closs: 0.7739 (0.8566) grad_norm: 0.5280 (0.7770) time: 6.0535 data: 0.0002 max mem: 71823 -[23:18:27.282990] Epoch: [0] [520/1624] lr: 0.000016 closs: 0.7739 (0.8563) grad_norm: 0.5309 (0.7725) time: 6.0535 data: 0.0002 max mem: 71823 -[23:19:27.577804] Epoch: [0] [530/1624] lr: 0.000016 closs: 0.8086 (0.8560) grad_norm: 0.5272 (0.7684) time: 6.0455 data: 0.0002 max mem: 71823 -[23:20:28.166561] Epoch: [0] [540/1624] lr: 0.000017 closs: 0.8167 (0.8544) grad_norm: 0.5260 (0.7642) time: 6.0440 data: 0.0002 max mem: 71823 -[23:21:28.665669] Epoch: [0] [550/1624] lr: 0.000017 closs: 0.7796 (0.8528) grad_norm: 0.5102 (0.7596) time: 6.0543 data: 0.0002 max mem: 71823 -[23:22:29.090855] Epoch: [0] [560/1624] lr: 0.000017 closs: 0.7821 (0.8513) grad_norm: 0.5219 (0.7556) time: 6.0461 data: 0.0003 max mem: 71823 -[23:23:29.370651] Epoch: [0] [570/1624] lr: 0.000018 closs: 0.7949 (0.8493) grad_norm: 0.5102 (0.7515) time: 6.0351 data: 0.0002 max mem: 71823 -[23:24:29.949431] Epoch: [0] [580/1624] lr: 0.000018 closs: 0.6984 (0.8467) grad_norm: 0.4892 (0.7469) time: 6.0428 data: 0.0002 max mem: 71823 -[23:25:30.334929] Epoch: [0] [590/1624] lr: 0.000018 closs: 0.6984 (0.8449) grad_norm: 0.4999 (0.7435) time: 6.0481 data: 0.0002 max mem: 71823 -[23:26:30.910804] Epoch: [0] [600/1624] lr: 0.000018 closs: 0.8107 (0.8438) grad_norm: 0.4999 (0.7398) time: 6.0479 data: 0.0002 max mem: 71823 -[23:27:31.364599] Epoch: [0] [610/1624] lr: 0.000019 closs: 0.8694 (0.8442) grad_norm: 0.5091 (0.7363) time: 6.0514 data: 0.0002 max mem: 71823 -[23:28:31.842008] Epoch: [0] [620/1624] lr: 0.000019 closs: 0.8268 (0.8444) grad_norm: 0.5191 (0.7328) time: 6.0464 data: 0.0002 max mem: 71823 -[23:29:32.376555] Epoch: [0] [630/1624] lr: 0.000019 closs: 0.8268 (0.8441) grad_norm: 0.5233 (0.7297) time: 6.0505 data: 0.0002 max mem: 71823 -[23:30:32.745659] Epoch: [0] [640/1624] lr: 0.000020 closs: 0.8384 (0.8438) grad_norm: 0.4674 (0.7254) time: 6.0450 data: 0.0002 max mem: 71823 -[23:31:33.288711] Epoch: [0] [650/1624] lr: 0.000020 closs: 0.8384 (0.8436) grad_norm: 0.4773 (0.7239) time: 6.0455 data: 0.0002 max mem: 71823 -[23:32:33.963425] Epoch: [0] [660/1624] lr: 0.000020 closs: 0.8172 (0.8427) grad_norm: 0.4674 (0.7198) time: 6.0608 data: 0.0002 max mem: 71823 -[23:33:34.486966] Epoch: [0] [670/1624] lr: 0.000021 closs: 0.7600 (0.8419) grad_norm: 0.4655 (0.7161) time: 6.0598 data: 0.0002 max mem: 71823 -[23:34:34.966708] Epoch: [0] [680/1624] lr: 0.000021 closs: 0.7269 (0.8398) grad_norm: 0.4773 (0.7170) time: 6.0500 data: 0.0002 max mem: 71823 -[23:35:35.370651] Epoch: [0] [690/1624] lr: 0.000021 closs: 0.7387 (0.8400) grad_norm: 0.4681 (0.7134) time: 6.0441 data: 0.0002 max mem: 71823 -[23:36:36.065433] Epoch: [0] [700/1624] lr: 0.000022 closs: 0.7369 (0.8383) grad_norm: 0.4769 (0.7098) time: 6.0548 data: 0.0002 max mem: 71823 -[23:37:36.529279] Epoch: [0] [710/1624] lr: 0.000022 closs: 0.7012 (0.8376) grad_norm: 0.4804 (0.7149) time: 6.0578 data: 0.0002 max mem: 71823 -[23:38:37.065538] Epoch: [0] [720/1624] lr: 0.000022 closs: 0.7388 (0.8365) grad_norm: 0.4804 (0.7124) time: 6.0499 data: 0.0002 max mem: 71823 -[23:39:37.370496] Epoch: [0] [730/1624] lr: 0.000022 closs: 0.7809 (0.8351) grad_norm: 0.5121 (0.7095) time: 6.0419 data: 0.0002 max mem: 71823 -[23:40:38.036640] Epoch: [0] [740/1624] lr: 0.000023 closs: 0.6884 (0.8333) grad_norm: 0.5142 (0.7067) time: 6.0484 data: 0.0002 max mem: 71823 -[23:41:38.580953] Epoch: [0] [750/1624] lr: 0.000023 closs: 0.6721 (0.8326) grad_norm: 0.5014 (0.7038) time: 6.0604 data: 0.0002 max mem: 71823 -[23:42:39.294877] Epoch: [0] [760/1624] lr: 0.000023 closs: 0.7183 (0.8312) grad_norm: 0.4924 (0.7009) time: 6.0628 data: 0.0002 max mem: 71823 -[23:43:39.769468] Epoch: [0] [770/1624] lr: 0.000024 closs: 0.7493 (0.8302) grad_norm: 0.4896 (0.6986) time: 6.0593 data: 0.0002 max mem: 71823 -[23:44:40.302337] Epoch: [0] [780/1624] lr: 0.000024 closs: 0.7579 (0.8294) grad_norm: 0.4847 (0.6951) time: 6.0502 data: 0.0002 max mem: 71823 -[23:45:40.762594] Epoch: [0] [790/1624] lr: 0.000024 closs: 0.7569 (0.8285) grad_norm: 0.4712 (0.6927) time: 6.0495 data: 0.0002 max mem: 71823 -[23:46:41.199321] Epoch: [0] [800/1624] lr: 0.000025 closs: 0.7144 (0.8276) grad_norm: 0.4838 (0.6907) time: 6.0447 data: 0.0002 max mem: 71823 -[23:47:41.566056] Epoch: [0] [810/1624] lr: 0.000025 closs: 0.7986 (0.8274) grad_norm: 0.4838 (0.6886) time: 6.0400 data: 0.0002 max mem: 71823 -[23:48:42.245405] Epoch: [0] [820/1624] lr: 0.000025 closs: 0.7986 (0.8267) grad_norm: 0.4869 (0.6861) time: 6.0521 data: 0.0002 max mem: 71823 -[23:49:42.662326] Epoch: [0] [830/1624] lr: 0.000026 closs: 0.7340 (0.8258) grad_norm: 0.4912 (0.6840) time: 6.0547 data: 0.0002 max mem: 71823 -[23:50:43.257186] Epoch: [0] [840/1624] lr: 0.000026 closs: 0.7994 (0.8254) grad_norm: 0.4937 (0.6820) time: 6.0505 data: 0.0002 max mem: 71823 -[23:51:43.655316] Epoch: [0] [850/1624] lr: 0.000026 closs: 0.7980 (0.8248) grad_norm: 0.4912 (0.6791) time: 6.0495 data: 0.0002 max mem: 71823 -[23:52:44.266502] Epoch: [0] [860/1624] lr: 0.000026 closs: 0.7105 (0.8235) grad_norm: 0.4895 (0.6767) time: 6.0503 data: 0.0002 max mem: 71823 -[23:53:44.800691] Epoch: [0] [870/1624] lr: 0.000027 closs: 0.8095 (0.8231) grad_norm: 0.4816 (0.6784) time: 6.0571 data: 0.0002 max mem: 71823 -[23:54:45.267304] Epoch: [0] [880/1624] lr: 0.000027 closs: 0.8214 (0.8226) grad_norm: 0.4779 (0.6763) time: 6.0499 data: 0.0002 max mem: 71823 -[23:55:45.716504] Epoch: [0] [890/1624] lr: 0.000027 closs: 0.7135 (0.8216) grad_norm: 0.4864 (0.6750) time: 6.0457 data: 0.0002 max mem: 71823 -[23:56:46.160101] Epoch: [0] [900/1624] lr: 0.000028 closs: 0.7135 (0.8212) grad_norm: 0.5017 (0.6731) time: 6.0445 data: 0.0002 max mem: 71823 -[23:57:46.662843] Epoch: [0] [910/1624] lr: 0.000028 closs: 0.7576 (0.8211) grad_norm: 0.4573 (0.6706) time: 6.0472 data: 0.0002 max mem: 71823 -[23:58:47.218755] Epoch: [0] [920/1624] lr: 0.000028 closs: 0.7472 (0.8207) grad_norm: 0.4449 (0.6679) time: 6.0528 data: 0.0002 max mem: 71823 -[23:59:47.712687] Epoch: [0] [930/1624] lr: 0.000029 closs: 0.7851 (0.8209) grad_norm: 0.4445 (0.6664) time: 6.0523 data: 0.0002 max mem: 71823 -[00:00:48.269492] Epoch: [0] [940/1624] lr: 0.000029 closs: 0.8078 (0.8209) grad_norm: 0.4416 (0.6651) time: 6.0524 data: 0.0002 max mem: 71823 -[00:01:48.796436] Epoch: [0] [950/1624] lr: 0.000029 closs: 0.8386 (0.8212) grad_norm: 0.4425 (0.6637) time: 6.0541 data: 0.0002 max mem: 71823 -[00:02:49.347986] Epoch: [0] [960/1624] lr: 0.000030 closs: 0.8276 (0.8212) grad_norm: 0.4732 (0.6618) time: 6.0538 data: 0.0002 max mem: 71823 -[00:03:49.816559] Epoch: [0] [970/1624] lr: 0.000030 closs: 0.7799 (0.8208) grad_norm: 0.4732 (0.6599) time: 6.0509 data: 0.0002 max mem: 71823 -[00:04:50.413099] Epoch: [0] [980/1624] lr: 0.000030 closs: 0.7799 (0.8204) grad_norm: 0.4732 (0.6580) time: 6.0531 data: 0.0002 max mem: 71823 -[00:05:50.860783] Epoch: [0] [990/1624] lr: 0.000030 closs: 0.8153 (0.8206) grad_norm: 0.4778 (0.6562) time: 6.0521 data: 0.0002 max mem: 71823 -[00:06:51.581337] Epoch: [0] [1000/1624] lr: 0.000031 closs: 0.8154 (0.8211) grad_norm: 0.4778 (0.6548) time: 6.0583 data: 0.0002 max mem: 71823 -[00:07:51.957533] Epoch: [0] [1010/1624] lr: 0.000031 closs: 0.7950 (0.8204) grad_norm: 0.4885 (0.6536) time: 6.0547 data: 0.0002 max mem: 71823 -[00:08:52.445747] Epoch: [0] [1020/1624] lr: 0.000031 closs: 0.7651 (0.8199) grad_norm: 0.5143 (0.6530) time: 6.0431 data: 0.0003 max mem: 71823 -[00:09:52.995376] Epoch: [0] [1030/1624] lr: 0.000032 closs: 0.7591 (0.8191) grad_norm: 0.5227 (0.6526) time: 6.0518 data: 0.0003 max mem: 71823 -[00:10:53.622019] Epoch: [0] [1040/1624] lr: 0.000032 closs: 0.7528 (0.8187) grad_norm: 0.5261 (0.6512) time: 6.0587 data: 0.0002 max mem: 71823 -[00:11:53.976986] Epoch: [0] [1050/1624] lr: 0.000032 closs: 0.7528 (0.8189) grad_norm: 0.5117 (0.6496) time: 6.0489 data: 0.0002 max mem: 71823 -[00:12:54.620882] Epoch: [0] [1060/1624] lr: 0.000033 closs: 0.8134 (0.8186) grad_norm: 0.4736 (0.6479) time: 6.0498 data: 0.0002 max mem: 71823 -[00:13:55.063617] Epoch: [0] [1070/1624] lr: 0.000033 closs: 0.8134 (0.8188) grad_norm: 0.4731 (0.6461) time: 6.0542 data: 0.0002 max mem: 71823 -[00:14:55.656892] Epoch: [0] [1080/1624] lr: 0.000033 closs: 0.8299 (0.8192) grad_norm: 0.4561 (0.6441) time: 6.0517 data: 0.0002 max mem: 71823 -[00:15:55.987912] Epoch: [0] [1090/1624] lr: 0.000034 closs: 0.8128 (0.8192) grad_norm: 0.4574 (0.6434) time: 6.0461 data: 0.0002 max mem: 71823 -[00:16:56.601571] Epoch: [0] [1100/1624] lr: 0.000034 closs: 0.7545 (0.8185) grad_norm: 0.4574 (0.6421) time: 6.0471 data: 0.0002 max mem: 71823 -[00:17:57.036343] Epoch: [0] [1110/1624] lr: 0.000034 closs: 0.7949 (0.8184) grad_norm: 0.4476 (0.6400) time: 6.0523 data: 0.0002 max mem: 71823 -[00:18:57.423394] Epoch: [0] [1120/1624] lr: 0.000034 closs: 0.7867 (0.8186) grad_norm: 0.4574 (0.6383) time: 6.0410 data: 0.0002 max mem: 71823 -[00:19:57.977015] Epoch: [0] [1130/1624] lr: 0.000035 closs: 0.7415 (0.8179) grad_norm: 0.4476 (0.6370) time: 6.0469 data: 0.0002 max mem: 71823 -[00:20:58.699927] Epoch: [0] [1140/1624] lr: 0.000035 closs: 0.7653 (0.8178) grad_norm: 0.4532 (0.6365) time: 6.0637 data: 0.0002 max mem: 71823 -[00:21:59.109752] Epoch: [0] [1150/1624] lr: 0.000035 closs: 0.8332 (0.8182) grad_norm: 0.4692 (0.6355) time: 6.0565 data: 0.0002 max mem: 71823 -[00:22:59.668552] Epoch: [0] [1160/1624] lr: 0.000036 closs: 0.7900 (0.8174) grad_norm: 0.4692 (0.6343) time: 6.0483 data: 0.0002 max mem: 71823 -[00:24:00.063419] Epoch: [0] [1170/1624] lr: 0.000036 closs: 0.7013 (0.8170) grad_norm: 0.4633 (0.6327) time: 6.0475 data: 0.0002 max mem: 71823 -[00:25:00.686234] Epoch: [0] [1180/1624] lr: 0.000036 closs: 0.7388 (0.8172) grad_norm: 0.4462 (0.6307) time: 6.0507 data: 0.0002 max mem: 71823 -[00:26:01.043632] Epoch: [0] [1190/1624] lr: 0.000037 closs: 0.7458 (0.8167) grad_norm: 0.4483 (0.6303) time: 6.0489 data: 0.0002 max mem: 71823 -[00:27:01.508176] Epoch: [0] [1200/1624] lr: 0.000037 closs: 0.7851 (0.8166) grad_norm: 0.4404 (0.6286) time: 6.0409 data: 0.0002 max mem: 71823 -[00:28:01.973444] Epoch: [0] [1210/1624] lr: 0.000037 closs: 0.7641 (0.8149) grad_norm: 0.4628 (0.6282) time: 6.0463 data: 0.0002 max mem: 71823 -[00:29:02.420363] Epoch: [0] [1220/1624] lr: 0.000038 closs: 0.6692 (0.8139) grad_norm: 0.4887 (0.6274) time: 6.0454 data: 0.0002 max mem: 71823 -[00:30:02.876103] Epoch: [0] [1230/1624] lr: 0.000038 closs: 0.7347 (0.8139) grad_norm: 0.4628 (0.6262) time: 6.0450 data: 0.0002 max mem: 71823 -[00:31:03.343448] Epoch: [0] [1240/1624] lr: 0.000038 closs: 0.7568 (0.8132) grad_norm: 0.4991 (0.6253) time: 6.0460 data: 0.0002 max mem: 71823 -[00:32:03.870548] Epoch: [0] [1250/1624] lr: 0.000038 closs: 0.7324 (0.8126) grad_norm: 0.4730 (0.6240) time: 6.0496 data: 0.0002 max mem: 71823 -[00:33:04.508350] Epoch: [0] [1260/1624] lr: 0.000039 closs: 0.7553 (0.8119) grad_norm: 0.4730 (0.6229) time: 6.0581 data: 0.0002 max mem: 71823 -[00:34:05.011527] Epoch: [0] [1270/1624] lr: 0.000039 closs: 0.7699 (0.8123) grad_norm: 0.4832 (0.6225) time: 6.0569 data: 0.0002 max mem: 71823 -[00:35:05.523322] Epoch: [0] [1280/1624] lr: 0.000039 closs: 0.7892 (0.8121) grad_norm: 0.4674 (0.6213) time: 6.0506 data: 0.0002 max mem: 71823 -[00:36:05.919592] Epoch: [0] [1290/1624] lr: 0.000040 closs: 0.7237 (0.8120) grad_norm: 0.4521 (0.6199) time: 6.0453 data: 0.0002 max mem: 71823 -[00:37:06.412781] Epoch: [0] [1300/1624] lr: 0.000040 closs: 0.6858 (0.8115) grad_norm: 0.4520 (0.6186) time: 6.0443 data: 0.0002 max mem: 71823 -[00:38:06.873959] Epoch: [0] [1310/1624] lr: 0.000040 closs: 0.7727 (0.8117) grad_norm: 0.4463 (0.6178) time: 6.0476 data: 0.0002 max mem: 71823 -[00:39:07.370423] Epoch: [0] [1320/1624] lr: 0.000041 closs: 0.7232 (0.8108) grad_norm: 0.4450 (0.6168) time: 6.0478 data: 0.0002 max mem: 71823 -[00:40:07.895392] Epoch: [0] [1330/1624] lr: 0.000041 closs: 0.6910 (0.8102) grad_norm: 0.4264 (0.6152) time: 6.0509 data: 0.0002 max mem: 71823 -[00:41:08.460914] Epoch: [0] [1340/1624] lr: 0.000041 closs: 0.7388 (0.8097) grad_norm: 0.4181 (0.6137) time: 6.0544 data: 0.0002 max mem: 71823 -[00:42:08.991942] Epoch: [0] [1350/1624] lr: 0.000042 closs: 0.7582 (0.8095) grad_norm: 0.4181 (0.6132) time: 6.0547 data: 0.0002 max mem: 71823 -[00:43:09.414387] Epoch: [0] [1360/1624] lr: 0.000042 closs: 0.8159 (0.8092) grad_norm: 0.4079 (0.6120) time: 6.0475 data: 0.0002 max mem: 71823 -[00:44:09.927773] Epoch: [0] [1370/1624] lr: 0.000042 closs: 0.7237 (0.8082) grad_norm: 0.4363 (0.6126) time: 6.0467 data: 0.0002 max mem: 71823 -[00:45:10.493209] Epoch: [0] [1380/1624] lr: 0.000042 closs: 0.7237 (0.8080) grad_norm: 0.4577 (0.6112) time: 6.0538 data: 0.0002 max mem: 71823 -[00:46:10.976536] Epoch: [0] [1390/1624] lr: 0.000043 closs: 0.7631 (0.8079) grad_norm: 0.4608 (0.6103) time: 6.0523 data: 0.0002 max mem: 71823 -[00:47:11.398626] Epoch: [0] [1400/1624] lr: 0.000043 closs: 0.8206 (0.8082) grad_norm: 0.4738 (0.6097) time: 6.0451 data: 0.0002 max mem: 71823 -[00:48:11.875397] Epoch: [0] [1410/1624] lr: 0.000043 closs: 0.7574 (0.8077) grad_norm: 0.4577 (0.6086) time: 6.0448 data: 0.0002 max mem: 71823 -[00:49:12.438805] Epoch: [0] [1420/1624] lr: 0.000044 closs: 0.7035 (0.8077) grad_norm: 0.4633 (0.6089) time: 6.0519 data: 0.0002 max mem: 71823 -[00:50:12.888570] Epoch: [0] [1430/1624] lr: 0.000044 closs: 0.7161 (0.8070) grad_norm: 0.4441 (0.6087) time: 6.0505 data: 0.0002 max mem: 71823 -[00:51:13.382681] Epoch: [0] [1440/1624] lr: 0.000044 closs: 0.7382 (0.8068) grad_norm: 0.4478 (0.6080) time: 6.0470 data: 0.0002 max mem: 71823 -[00:52:13.823044] Epoch: [0] [1450/1624] lr: 0.000045 closs: 0.8055 (0.8072) grad_norm: 0.4339 (0.6069) time: 6.0466 data: 0.0002 max mem: 71823 -[00:53:14.275251] Epoch: [0] [1460/1624] lr: 0.000045 closs: 0.8412 (0.8071) grad_norm: 0.4339 (0.6056) time: 6.0445 data: 0.0002 max mem: 71823 -[00:54:14.733571] Epoch: [0] [1470/1624] lr: 0.000045 closs: 0.8392 (0.8069) grad_norm: 0.4419 (0.6047) time: 6.0454 data: 0.0002 max mem: 71823 -[00:55:15.262859] Epoch: [0] [1480/1624] lr: 0.000046 closs: 0.7298 (0.8062) grad_norm: 0.4370 (0.6043) time: 6.0492 data: 0.0002 max mem: 71823 -[00:56:15.668562] Epoch: [0] [1490/1624] lr: 0.000046 closs: 0.7512 (0.8063) grad_norm: 0.4419 (0.6031) time: 6.0466 data: 0.0002 max mem: 71823 -[00:57:16.276983] Epoch: [0] [1500/1624] lr: 0.000046 closs: 0.7684 (0.8061) grad_norm: 0.4482 (0.6021) time: 6.0506 data: 0.0002 max mem: 71823 -[00:58:16.880793] Epoch: [0] [1510/1624] lr: 0.000046 closs: 0.8047 (0.8063) grad_norm: 0.4265 (0.6008) time: 6.0605 data: 0.0002 max mem: 71823 -[00:59:17.405380] Epoch: [0] [1520/1624] lr: 0.000047 closs: 0.8076 (0.8063) grad_norm: 0.4159 (0.5998) time: 6.0563 data: 0.0002 max mem: 71823 -[01:00:17.870037] Epoch: [0] [1530/1624] lr: 0.000047 closs: 0.7752 (0.8063) grad_norm: 0.4480 (0.5990) time: 6.0493 data: 0.0002 max mem: 71823 -[01:01:18.491447] Epoch: [0] [1540/1624] lr: 0.000047 closs: 0.7603 (0.8058) grad_norm: 0.4480 (0.5985) time: 6.0542 data: 0.0002 max mem: 71823 -[01:02:18.934249] Epoch: [0] [1550/1624] lr: 0.000048 closs: 0.7603 (0.8056) grad_norm: 0.4527 (0.5974) time: 6.0531 data: 0.0002 max mem: 71823 -[01:03:19.520583] Epoch: [0] [1560/1624] lr: 0.000048 closs: 0.8045 (0.8057) grad_norm: 0.4439 (0.5965) time: 6.0513 data: 0.0002 max mem: 71823 -[01:04:19.823441] Epoch: [0] [1570/1624] lr: 0.000048 closs: 0.8045 (0.8057) grad_norm: 0.4409 (0.5955) time: 6.0443 data: 0.0002 max mem: 71823 -[01:05:20.351426] Epoch: [0] [1580/1624] lr: 0.000049 closs: 0.7694 (0.8052) grad_norm: 0.4409 (0.5948) time: 6.0414 data: 0.0002 max mem: 71823 -[01:06:20.863079] Epoch: [0] [1590/1624] lr: 0.000049 closs: 0.7413 (0.8051) grad_norm: 0.4409 (0.5936) time: 6.0519 data: 0.0002 max mem: 71823 -[01:07:21.291639] Epoch: [0] [1600/1624] lr: 0.000049 closs: 0.7553 (0.8049) grad_norm: 0.4424 (0.5926) time: 6.0469 data: 0.0002 max mem: 71823 -[01:08:21.722506] Epoch: [0] [1610/1624] lr: 0.000050 closs: 0.7771 (0.8046) grad_norm: 0.4442 (0.5918) time: 6.0428 data: 0.0002 max mem: 71823 -[01:09:22.228989] Epoch: [0] [1620/1624] lr: 0.000050 closs: 0.7632 (0.8043) grad_norm: 0.4442 (0.5913) time: 6.0467 data: 0.0002 max mem: 71823 -[01:09:41.036014] Epoch: [0] Total time: 2:43:50 -[01:09:41.133465] Averaged stats: lr: 0.000050 closs: 0.7484 (0.8056) grad_norm: 0.4430 (0.5908) -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[01:09:42.128516] model saved -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[01:09:46.524808] optimizer saved -[01:09:46.525661] other rank-common saved -[01:09:46.532741] rank-specific saved -[01:09:46.553147] log_dir: ./output_dir -[01:09:55.112930] Epoch: [1] [0/1624] lr: 0.000050 closs: 0.6580 (0.6580) time: 8.5592 data: 2.5611 max mem: 71823 -[01:10:55.727786] Epoch: [1] [10/1624] lr: 0.000050 closs: 0.7038 (0.6488) grad_norm: 0.4037 (0.4740) time: 6.2884 data: 0.2330 max mem: 71823 -[01:11:56.152613] Epoch: [1] [20/1624] lr: 0.000050 closs: 0.7233 (0.6901) grad_norm: 0.4359 (0.4631) time: 6.0518 data: 0.0002 max mem: 71823 -[01:12:56.579312] Epoch: [1] [30/1624] lr: 0.000050 closs: 0.7405 (0.7317) grad_norm: 0.4359 (0.4567) time: 6.0424 data: 0.0002 max mem: 71823 -[01:13:57.162532] Epoch: [1] [40/1624] lr: 0.000050 closs: 0.7413 (0.7383) grad_norm: 0.4377 (0.4836) time: 6.0504 data: 0.0002 max mem: 71823 -[01:14:57.657870] Epoch: [1] [50/1624] lr: 0.000050 closs: 0.7719 (0.7440) grad_norm: 0.4377 (0.4980) time: 6.0538 data: 0.0002 max mem: 71823 -[01:15:58.117601] Epoch: [1] [60/1624] lr: 0.000050 closs: 0.7719 (0.7421) grad_norm: 0.4303 (0.5216) time: 6.0476 data: 0.0002 max mem: 71823 -[01:16:58.648858] Epoch: [1] [70/1624] lr: 0.000050 closs: 0.7423 (0.7456) grad_norm: 0.4619 (0.5175) time: 6.0494 data: 0.0002 max mem: 71823 -[01:17:59.407864] Epoch: [1] [80/1624] lr: 0.000050 closs: 0.7572 (0.7495) grad_norm: 0.4303 (0.5079) time: 6.0644 data: 0.0002 max mem: 71823 -[01:18:59.859924] Epoch: [1] [90/1624] lr: 0.000050 closs: 0.7572 (0.7518) grad_norm: 0.4138 (0.4961) time: 6.0604 data: 0.0002 max mem: 71823 -[01:20:00.443175] Epoch: [1] [100/1624] lr: 0.000050 closs: 0.7708 (0.7551) grad_norm: 0.4343 (0.4940) time: 6.0516 data: 0.0002 max mem: 71823 -[01:21:00.773749] Epoch: [1] [110/1624] lr: 0.000050 closs: 0.7760 (0.7584) grad_norm: 0.4343 (0.4901) time: 6.0455 data: 0.0002 max mem: 71823 -[01:22:01.402715] Epoch: [1] [120/1624] lr: 0.000050 closs: 0.7536 (0.7585) grad_norm: 0.4182 (0.4860) time: 6.0478 data: 0.0002 max mem: 71823 -[01:23:01.819936] Epoch: [1] [130/1624] lr: 0.000050 closs: 0.7302 (0.7631) grad_norm: 0.4546 (0.4894) time: 6.0522 data: 0.0002 max mem: 71823 -[01:24:02.159204] Epoch: [1] [140/1624] lr: 0.000050 closs: 0.7983 (0.7691) grad_norm: 0.4476 (0.4859) time: 6.0377 data: 0.0002 max mem: 71823 -[01:25:02.681039] Epoch: [1] [150/1624] lr: 0.000050 closs: 0.8091 (0.7707) grad_norm: 0.4402 (0.4841) time: 6.0429 data: 0.0002 max mem: 71823 -[01:26:03.361300] Epoch: [1] [160/1624] lr: 0.000050 closs: 0.7688 (0.7682) grad_norm: 0.4402 (0.4797) time: 6.0600 data: 0.0002 max mem: 71823 -[01:27:03.764437] Epoch: [1] [170/1624] lr: 0.000050 closs: 0.7334 (0.7688) grad_norm: 0.4205 (0.4765) time: 6.0540 data: 0.0002 max mem: 71823 -[01:28:04.412920] Epoch: [1] [180/1624] lr: 0.000050 closs: 0.7812 (0.7712) grad_norm: 0.4129 (0.4742) time: 6.0525 data: 0.0002 max mem: 71823 -[01:29:04.784598] Epoch: [1] [190/1624] lr: 0.000050 closs: 0.7550 (0.7698) grad_norm: 0.4124 (0.4744) time: 6.0509 data: 0.0002 max mem: 71823 -[01:30:05.344411] Epoch: [1] [200/1624] lr: 0.000050 closs: 0.7482 (0.7740) grad_norm: 0.4203 (0.4745) time: 6.0464 data: 0.0002 max mem: 71823 -[01:31:05.720567] Epoch: [1] [210/1624] lr: 0.000050 closs: 0.7566 (0.7725) grad_norm: 0.4255 (0.4731) time: 6.0467 data: 0.0002 max mem: 71823 -[01:32:06.188290] Epoch: [1] [220/1624] lr: 0.000050 closs: 0.7333 (0.7680) grad_norm: 0.4264 (0.4712) time: 6.0421 data: 0.0002 max mem: 71823 -[01:33:06.777809] Epoch: [1] [230/1624] lr: 0.000050 closs: 0.7333 (0.7685) grad_norm: 0.4445 (0.4711) time: 6.0527 data: 0.0002 max mem: 71823 -[01:34:07.454152] Epoch: [1] [240/1624] lr: 0.000050 closs: 0.8581 (0.7716) grad_norm: 0.4232 (0.4706) time: 6.0631 data: 0.0002 max mem: 71823 -[01:35:07.878045] Epoch: [1] [250/1624] lr: 0.000050 closs: 0.8456 (0.7724) grad_norm: 0.4380 (0.4705) time: 6.0549 data: 0.0002 max mem: 71823 -[01:36:08.313878] Epoch: [1] [260/1624] lr: 0.000050 closs: 0.8001 (0.7705) grad_norm: 0.4765 (0.4702) time: 6.0429 data: 0.0002 max mem: 71823 -[01:37:08.897940] Epoch: [1] [270/1624] lr: 0.000050 closs: 0.7580 (0.7703) grad_norm: 0.4364 (0.4684) time: 6.0509 data: 0.0002 max mem: 71823 -[01:38:09.381589] Epoch: [1] [280/1624] lr: 0.000050 closs: 0.7792 (0.7702) grad_norm: 0.4364 (0.4725) time: 6.0533 data: 0.0002 max mem: 71823 -[01:39:09.917941] Epoch: [1] [290/1624] lr: 0.000050 closs: 0.7362 (0.7687) grad_norm: 0.4189 (0.4725) time: 6.0509 data: 0.0002 max mem: 71823 -[01:40:10.377228] Epoch: [1] [300/1624] lr: 0.000050 closs: 0.6970 (0.7688) grad_norm: 0.4097 (0.4729) time: 6.0496 data: 0.0002 max mem: 71823 -[01:41:10.888915] Epoch: [1] [310/1624] lr: 0.000050 closs: 0.8307 (0.7709) grad_norm: 0.4097 (0.4721) time: 6.0484 data: 0.0002 max mem: 71823 -[01:42:11.526255] Epoch: [1] [320/1624] lr: 0.000050 closs: 0.8109 (0.7710) grad_norm: 0.4273 (0.4922) time: 6.0573 data: 0.0002 max mem: 71823 -[01:43:11.842078] Epoch: [1] [330/1624] lr: 0.000049 closs: 0.7901 (0.7731) grad_norm: 0.4273 (0.4907) time: 6.0475 data: 0.0002 max mem: 71823 -[01:44:12.549549] Epoch: [1] [340/1624] lr: 0.000049 closs: 0.7861 (0.7733) grad_norm: 0.4386 (0.4907) time: 6.0510 data: 0.0002 max mem: 71823 -[01:45:12.897484] Epoch: [1] [350/1624] lr: 0.000049 closs: 0.7702 (0.7747) grad_norm: 0.4636 (0.4913) time: 6.0526 data: 0.0002 max mem: 71823 -[01:46:13.442480] Epoch: [1] [360/1624] lr: 0.000049 closs: 0.7744 (0.7745) grad_norm: 0.4636 (0.4904) time: 6.0445 data: 0.0002 max mem: 71823 -[01:47:13.938571] Epoch: [1] [370/1624] lr: 0.000049 closs: 0.7452 (0.7736) grad_norm: 0.4546 (0.4889) time: 6.0519 data: 0.0002 max mem: 71823 -[01:48:14.285944] Epoch: [1] [380/1624] lr: 0.000049 closs: 0.7026 (0.7738) grad_norm: 0.4132 (0.4865) time: 6.0420 data: 0.0002 max mem: 71823 -[01:49:15.026489] Epoch: [1] [390/1624] lr: 0.000049 closs: 0.7788 (0.7741) grad_norm: 0.4063 (0.4859) time: 6.0543 data: 0.0002 max mem: 71823 -[01:50:15.767528] Epoch: [1] [400/1624] lr: 0.000049 closs: 0.7618 (0.7740) grad_norm: 0.4130 (0.4854) time: 6.0739 data: 0.0002 max mem: 71823 -[01:51:16.185310] Epoch: [1] [410/1624] lr: 0.000049 closs: 0.8079 (0.7745) grad_norm: 0.3967 (0.4839) time: 6.0578 data: 0.0002 max mem: 71823 -[01:52:16.665276] Epoch: [1] [420/1624] lr: 0.000049 closs: 0.8101 (0.7756) grad_norm: 0.3940 (0.4817) time: 6.0447 data: 0.0002 max mem: 71823 -[01:53:16.974029] Epoch: [1] [430/1624] lr: 0.000049 closs: 0.7404 (0.7748) grad_norm: 0.3940 (0.4809) time: 6.0393 data: 0.0002 max mem: 71823 -[01:54:17.637878] Epoch: [1] [440/1624] lr: 0.000049 closs: 0.7074 (0.7742) grad_norm: 0.3993 (0.4803) time: 6.0485 data: 0.0002 max mem: 71823 -[01:55:18.094154] Epoch: [1] [450/1624] lr: 0.000049 closs: 0.6877 (0.7722) grad_norm: 0.3993 (0.4778) time: 6.0559 data: 0.0002 max mem: 71823 -[01:56:18.650972] Epoch: [1] [460/1624] lr: 0.000049 closs: 0.7017 (0.7727) grad_norm: 0.4094 (0.4775) time: 6.0505 data: 0.0002 max mem: 71823 -[01:57:19.326707] Epoch: [1] [470/1624] lr: 0.000049 closs: 0.7751 (0.7722) grad_norm: 0.4161 (0.4771) time: 6.0615 data: 0.0002 max mem: 71823 -[01:58:19.939203] Epoch: [1] [480/1624] lr: 0.000049 closs: 0.7489 (0.7715) grad_norm: 0.4154 (0.4758) time: 6.0643 data: 0.0002 max mem: 71823 -[01:59:20.434255] Epoch: [1] [490/1624] lr: 0.000049 closs: 0.7363 (0.7708) grad_norm: 0.4187 (0.4745) time: 6.0553 data: 0.0002 max mem: 71823 -[02:00:21.090304] Epoch: [1] [500/1624] lr: 0.000049 closs: 0.7227 (0.7712) grad_norm: 0.4154 (0.4734) time: 6.0574 data: 0.0002 max mem: 71823 -[02:01:21.396620] Epoch: [1] [510/1624] lr: 0.000049 closs: 0.6991 (0.7713) grad_norm: 0.4162 (0.4733) time: 6.0480 data: 0.0002 max mem: 71823 -[02:02:21.881705] Epoch: [1] [520/1624] lr: 0.000049 closs: 0.7572 (0.7719) grad_norm: 0.4108 (0.4726) time: 6.0394 data: 0.0002 max mem: 71823 -[02:03:22.320430] Epoch: [1] [530/1624] lr: 0.000049 closs: 0.7493 (0.7715) grad_norm: 0.4108 (0.4716) time: 6.0460 data: 0.0002 max mem: 71823 -[02:04:22.772808] Epoch: [1] [540/1624] lr: 0.000049 closs: 0.7810 (0.7728) grad_norm: 0.4276 (0.4721) time: 6.0444 data: 0.0002 max mem: 71823 -[02:05:23.319942] Epoch: [1] [550/1624] lr: 0.000049 closs: 0.7850 (0.7722) grad_norm: 0.4051 (0.4712) time: 6.0498 data: 0.0002 max mem: 71823 -[02:06:23.936162] Epoch: [1] [560/1624] lr: 0.000049 closs: 0.7716 (0.7732) grad_norm: 0.4160 (0.4704) time: 6.0580 data: 0.0002 max mem: 71823 -[02:07:24.397761] Epoch: [1] [570/1624] lr: 0.000048 closs: 0.7684 (0.7737) grad_norm: 0.4172 (0.4703) time: 6.0537 data: 0.0002 max mem: 71823 -[02:08:24.777526] Epoch: [1] [580/1624] lr: 0.000048 closs: 0.7684 (0.7746) grad_norm: 0.4263 (0.4704) time: 6.0419 data: 0.0002 max mem: 71823 -[02:09:25.105370] Epoch: [1] [590/1624] lr: 0.000048 closs: 0.7406 (0.7734) grad_norm: 0.4310 (0.4696) time: 6.0352 data: 0.0002 max mem: 71823 -[02:10:25.695936] Epoch: [1] [600/1624] lr: 0.000048 closs: 0.7406 (0.7728) grad_norm: 0.4350 (0.4696) time: 6.0458 data: 0.0002 max mem: 71823 -[02:11:26.223190] Epoch: [1] [610/1624] lr: 0.000048 closs: 0.7611 (0.7727) grad_norm: 0.4350 (0.4694) time: 6.0558 data: 0.0002 max mem: 71823 -[02:12:26.754096] Epoch: [1] [620/1624] lr: 0.000048 closs: 0.7142 (0.7709) grad_norm: 0.4334 (0.4689) time: 6.0528 data: 0.0002 max mem: 71823 -[02:13:27.446764] Epoch: [1] [630/1624] lr: 0.000048 closs: 0.7626 (0.7718) grad_norm: 0.4243 (0.4689) time: 6.0610 data: 0.0002 max mem: 71823 -[02:14:27.909481] Epoch: [1] [640/1624] lr: 0.000048 closs: 0.7979 (0.7720) grad_norm: 0.4156 (0.4681) time: 6.0576 data: 0.0002 max mem: 71823 -[02:15:28.422464] Epoch: [1] [650/1624] lr: 0.000048 closs: 0.7582 (0.7721) grad_norm: 0.4111 (0.4680) time: 6.0487 data: 0.0002 max mem: 71823 -[02:16:29.036071] Epoch: [1] [660/1624] lr: 0.000048 closs: 0.7194 (0.7714) grad_norm: 0.4084 (0.4688) time: 6.0562 data: 0.0002 max mem: 71823 -[02:17:29.391543] Epoch: [1] [670/1624] lr: 0.000048 closs: 0.6965 (0.7704) grad_norm: 0.4110 (0.4684) time: 6.0483 data: 0.0002 max mem: 71823 -[02:18:29.893005] Epoch: [1] [680/1624] lr: 0.000048 closs: 0.7126 (0.7703) grad_norm: 0.4110 (0.4678) time: 6.0427 data: 0.0002 max mem: 71823 -[02:19:30.283708] Epoch: [1] [690/1624] lr: 0.000048 closs: 0.7796 (0.7707) grad_norm: 0.4073 (0.4669) time: 6.0445 data: 0.0002 max mem: 71823 -[02:20:30.772176] Epoch: [1] [700/1624] lr: 0.000048 closs: 0.7084 (0.7686) grad_norm: 0.4110 (0.4664) time: 6.0438 data: 0.0002 max mem: 71823 -[02:21:31.383337] Epoch: [1] [710/1624] lr: 0.000048 closs: 0.6515 (0.7676) grad_norm: 0.4240 (0.4669) time: 6.0549 data: 0.0002 max mem: 71823 -[02:22:31.981516] Epoch: [1] [720/1624] lr: 0.000048 closs: 0.7414 (0.7682) grad_norm: 0.4167 (0.4664) time: 6.0603 data: 0.0002 max mem: 71823 -[02:23:32.416681] Epoch: [1] [730/1624] lr: 0.000048 closs: 0.7414 (0.7680) grad_norm: 0.4311 (0.4656) time: 6.0515 data: 0.0002 max mem: 71823 -[02:24:32.797816] Epoch: [1] [740/1624] lr: 0.000047 closs: 0.6798 (0.7665) grad_norm: 0.4419 (0.4658) time: 6.0407 data: 0.0002 max mem: 71823 -[02:25:33.105862] Epoch: [1] [750/1624] lr: 0.000047 closs: 0.7164 (0.7662) grad_norm: 0.4243 (0.4649) time: 6.0343 data: 0.0002 max mem: 71823 -[02:26:33.512140] Epoch: [1] [760/1624] lr: 0.000047 closs: 0.7464 (0.7664) grad_norm: 0.4339 (0.4672) time: 6.0356 data: 0.0002 max mem: 71823 -[02:27:34.016407] Epoch: [1] [770/1624] lr: 0.000047 closs: 0.7720 (0.7668) grad_norm: 0.4243 (0.4661) time: 6.0454 data: 0.0002 max mem: 71823 -[02:28:34.528147] Epoch: [1] [780/1624] lr: 0.000047 closs: 0.7679 (0.7665) grad_norm: 0.4088 (0.4666) time: 6.0507 data: 0.0002 max mem: 71823 -[02:29:35.018357] Epoch: [1] [790/1624] lr: 0.000047 closs: 0.7777 (0.7666) grad_norm: 0.4321 (0.4666) time: 6.0500 data: 0.0002 max mem: 71823 -[02:30:35.600335] Epoch: [1] [800/1624] lr: 0.000047 closs: 0.6456 (0.7656) grad_norm: 0.3962 (0.4655) time: 6.0535 data: 0.0002 max mem: 71823 -[02:31:36.010031] Epoch: [1] [810/1624] lr: 0.000047 closs: 0.7167 (0.7653) grad_norm: 0.3962 (0.4650) time: 6.0495 data: 0.0002 max mem: 71823 -[02:32:36.568959] Epoch: [1] [820/1624] lr: 0.000047 closs: 0.7547 (0.7661) grad_norm: 0.3962 (0.4642) time: 6.0483 data: 0.0002 max mem: 71823 -[02:33:36.974952] Epoch: [1] [830/1624] lr: 0.000047 closs: 0.7389 (0.7663) grad_norm: 0.3916 (0.4643) time: 6.0481 data: 0.0002 max mem: 71823 -[02:34:37.407477] Epoch: [1] [840/1624] lr: 0.000047 closs: 0.7576 (0.7663) grad_norm: 0.4037 (0.4638) time: 6.0418 data: 0.0002 max mem: 71823 -[02:35:37.882544] Epoch: [1] [850/1624] lr: 0.000047 closs: 0.7499 (0.7662) grad_norm: 0.4433 (0.4652) time: 6.0453 data: 0.0002 max mem: 71823 -[02:36:38.298273] Epoch: [1] [860/1624] lr: 0.000047 closs: 0.7499 (0.7666) grad_norm: 0.4433 (0.4644) time: 6.0444 data: 0.0002 max mem: 71823 -[02:37:38.877589] Epoch: [1] [870/1624] lr: 0.000047 closs: 0.7872 (0.7670) grad_norm: 0.4185 (0.4641) time: 6.0496 data: 0.0002 max mem: 71823 -[02:38:39.507871] Epoch: [1] [880/1624] lr: 0.000046 closs: 0.7770 (0.7673) grad_norm: 0.4185 (0.4634) time: 6.0604 data: 0.0002 max mem: 71823 -[02:39:39.820467] Epoch: [1] [890/1624] lr: 0.000046 closs: 0.7870 (0.7681) grad_norm: 0.4073 (0.4634) time: 6.0470 data: 0.0002 max mem: 71823 -[02:40:40.446195] Epoch: [1] [900/1624] lr: 0.000046 closs: 0.7773 (0.7678) grad_norm: 0.4185 (0.4629) time: 6.0468 data: 0.0002 max mem: 71823 -[02:41:40.918331] Epoch: [1] [910/1624] lr: 0.000046 closs: 0.7423 (0.7678) grad_norm: 0.4033 (0.4620) time: 6.0548 data: 0.0002 max mem: 71823 -[02:42:41.376369] Epoch: [1] [920/1624] lr: 0.000046 closs: 0.7398 (0.7678) grad_norm: 0.4091 (0.4613) time: 6.0464 data: 0.0002 max mem: 71823 -[02:43:41.789797] Epoch: [1] [930/1624] lr: 0.000046 closs: 0.7445 (0.7676) grad_norm: 0.3993 (0.4604) time: 6.0434 data: 0.0002 max mem: 71823 -[02:44:42.221243] Epoch: [1] [940/1624] lr: 0.000046 closs: 0.7373 (0.7673) grad_norm: 0.3993 (0.4601) time: 6.0421 data: 0.0002 max mem: 71823 -[02:45:42.686795] Epoch: [1] [950/1624] lr: 0.000046 closs: 0.7373 (0.7674) grad_norm: 0.4091 (0.4597) time: 6.0447 data: 0.0002 max mem: 71823 -[02:46:43.260905] Epoch: [1] [960/1624] lr: 0.000046 closs: 0.8081 (0.7683) grad_norm: 0.4213 (0.4619) time: 6.0519 data: 0.0002 max mem: 71823 -[02:47:43.634101] Epoch: [1] [970/1624] lr: 0.000046 closs: 0.7084 (0.7675) grad_norm: 0.4175 (0.4614) time: 6.0472 data: 0.0002 max mem: 71823 -[02:48:44.288682] Epoch: [1] [980/1624] lr: 0.000046 closs: 0.7084 (0.7673) grad_norm: 0.4161 (0.4610) time: 6.0513 data: 0.0002 max mem: 71823 -[02:49:44.723423] Epoch: [1] [990/1624] lr: 0.000046 closs: 0.7345 (0.7667) grad_norm: 0.4161 (0.4604) time: 6.0543 data: 0.0002 max mem: 71823 -[02:50:45.251614] Epoch: [1] [1000/1624] lr: 0.000045 closs: 0.6662 (0.7659) grad_norm: 0.4056 (0.4600) time: 6.0480 data: 0.0002 max mem: 71823 -[02:51:45.756379] Epoch: [1] [1010/1624] lr: 0.000045 closs: 0.7223 (0.7656) grad_norm: 0.4029 (0.4593) time: 6.0515 data: 0.0002 max mem: 71823 -[02:52:46.171547] Epoch: [1] [1020/1624] lr: 0.000045 closs: 0.7570 (0.7658) grad_norm: 0.4056 (0.4592) time: 6.0459 data: 0.0002 max mem: 71823 -[02:53:46.739102] Epoch: [1] [1030/1624] lr: 0.000045 closs: 0.7521 (0.7656) grad_norm: 0.3925 (0.4584) time: 6.0490 data: 0.0002 max mem: 71823 -[02:54:47.258981] Epoch: [1] [1040/1624] lr: 0.000045 closs: 0.7633 (0.7660) grad_norm: 0.3958 (0.4583) time: 6.0542 data: 0.0002 max mem: 71823 -[02:55:47.724350] Epoch: [1] [1050/1624] lr: 0.000045 closs: 0.7518 (0.7654) grad_norm: 0.4028 (0.4580) time: 6.0491 data: 0.0002 max mem: 71823 -[02:56:48.372652] Epoch: [1] [1060/1624] lr: 0.000045 closs: 0.6824 (0.7654) grad_norm: 0.4042 (0.4578) time: 6.0556 data: 0.0002 max mem: 71823 -[02:57:48.855767] Epoch: [1] [1070/1624] lr: 0.000045 closs: 0.7612 (0.7655) grad_norm: 0.4283 (0.4571) time: 6.0564 data: 0.0002 max mem: 71823 -[02:58:49.292004] Epoch: [1] [1080/1624] lr: 0.000045 closs: 0.7750 (0.7658) grad_norm: 0.4156 (0.4567) time: 6.0458 data: 0.0002 max mem: 71823 -[02:59:49.780419] Epoch: [1] [1090/1624] lr: 0.000045 closs: 0.7810 (0.7659) grad_norm: 0.4156 (0.4564) time: 6.0461 data: 0.0002 max mem: 71823 -[03:00:50.300798] Epoch: [1] [1100/1624] lr: 0.000045 closs: 0.7662 (0.7661) grad_norm: 0.3967 (0.4564) time: 6.0503 data: 0.0002 max mem: 71823 -[03:01:50.847453] Epoch: [1] [1110/1624] lr: 0.000044 closs: 0.7962 (0.7664) grad_norm: 0.4156 (0.4562) time: 6.0532 data: 0.0002 max mem: 71823 -[03:02:51.399774] Epoch: [1] [1120/1624] lr: 0.000044 closs: 0.7648 (0.7659) grad_norm: 0.4157 (0.4557) time: 6.0548 data: 0.0002 max mem: 71823 -[03:03:51.768317] Epoch: [1] [1130/1624] lr: 0.000044 closs: 0.6991 (0.7651) grad_norm: 0.4157 (0.4559) time: 6.0459 data: 0.0002 max mem: 71823 -[03:04:52.341519] Epoch: [1] [1140/1624] lr: 0.000044 closs: 0.6699 (0.7646) grad_norm: 0.4017 (0.4552) time: 6.0470 data: 0.0002 max mem: 71823 -[03:05:52.834885] Epoch: [1] [1150/1624] lr: 0.000044 closs: 0.7224 (0.7650) grad_norm: 0.4018 (0.4551) time: 6.0532 data: 0.0002 max mem: 71823 -[03:06:53.284139] Epoch: [1] [1160/1624] lr: 0.000044 closs: 0.7450 (0.7649) grad_norm: 0.4054 (0.4549) time: 6.0470 data: 0.0002 max mem: 71823 -[03:07:53.717241] Epoch: [1] [1170/1624] lr: 0.000044 closs: 0.7708 (0.7654) grad_norm: 0.4000 (0.4548) time: 6.0440 data: 0.0002 max mem: 71823 -[03:08:54.212187] Epoch: [1] [1180/1624] lr: 0.000044 closs: 0.7708 (0.7649) grad_norm: 0.4334 (0.4553) time: 6.0463 data: 0.0002 max mem: 71823 -[03:09:54.716697] Epoch: [1] [1190/1624] lr: 0.000044 closs: 0.6982 (0.7650) grad_norm: 0.4419 (0.4552) time: 6.0499 data: 0.0002 max mem: 71823 -[03:10:55.285025] Epoch: [1] [1200/1624] lr: 0.000044 closs: 0.6802 (0.7644) grad_norm: 0.4144 (0.4544) time: 6.0535 data: 0.0002 max mem: 71823 -[03:11:55.639125] Epoch: [1] [1210/1624] lr: 0.000043 closs: 0.7469 (0.7648) grad_norm: 0.4144 (0.4539) time: 6.0460 data: 0.0002 max mem: 71823 -[03:12:56.290435] Epoch: [1] [1220/1624] lr: 0.000043 closs: 0.7450 (0.7645) grad_norm: 0.3863 (0.4536) time: 6.0501 data: 0.0002 max mem: 71823 -[03:13:56.767091] Epoch: [1] [1230/1624] lr: 0.000043 closs: 0.7045 (0.7644) grad_norm: 0.3781 (0.4530) time: 6.0563 data: 0.0002 max mem: 71823 -[03:14:57.357099] Epoch: [1] [1240/1624] lr: 0.000043 closs: 0.7045 (0.7645) grad_norm: 0.3863 (0.4525) time: 6.0532 data: 0.0002 max mem: 71823 -[03:15:57.953617] Epoch: [1] [1250/1624] lr: 0.000043 closs: 0.7690 (0.7646) grad_norm: 0.3898 (0.4520) time: 6.0592 data: 0.0002 max mem: 71823 -[03:16:58.356419] Epoch: [1] [1260/1624] lr: 0.000043 closs: 0.7753 (0.7650) grad_norm: 0.3907 (0.4518) time: 6.0498 data: 0.0002 max mem: 71823 -[03:17:58.929477] Epoch: [1] [1270/1624] lr: 0.000043 closs: 0.7872 (0.7648) grad_norm: 0.4008 (0.4516) time: 6.0486 data: 0.0002 max mem: 71823 -[03:18:59.489801] Epoch: [1] [1280/1624] lr: 0.000043 closs: 0.7351 (0.7643) grad_norm: 0.4024 (0.4520) time: 6.0565 data: 0.0002 max mem: 71823 -[03:19:59.903937] Epoch: [1] [1290/1624] lr: 0.000043 closs: 0.7351 (0.7644) grad_norm: 0.4220 (0.4535) time: 6.0486 data: 0.0002 max mem: 71823 -[03:21:00.544941] Epoch: [1] [1300/1624] lr: 0.000043 closs: 0.7640 (0.7646) grad_norm: 0.4086 (0.4530) time: 6.0526 data: 0.0002 max mem: 71823 -[03:22:00.985811] Epoch: [1] [1310/1624] lr: 0.000042 closs: 0.7628 (0.7647) grad_norm: 0.4074 (0.4526) time: 6.0540 data: 0.0002 max mem: 71823 -[03:23:01.463960] Epoch: [1] [1320/1624] lr: 0.000042 closs: 0.7713 (0.7653) grad_norm: 0.3965 (0.4523) time: 6.0458 data: 0.0002 max mem: 71823 -[03:24:01.908994] Epoch: [1] [1330/1624] lr: 0.000042 closs: 0.8290 (0.7659) grad_norm: 0.3900 (0.4518) time: 6.0460 data: 0.0002 max mem: 71823 -[03:25:02.228027] Epoch: [1] [1340/1624] lr: 0.000042 closs: 0.8251 (0.7660) grad_norm: 0.3924 (0.4516) time: 6.0381 data: 0.0002 max mem: 71823 -[03:26:02.754515] Epoch: [1] [1350/1624] lr: 0.000042 closs: 0.7638 (0.7657) grad_norm: 0.3918 (0.4519) time: 6.0421 data: 0.0002 max mem: 71823 -[03:27:03.336285] Epoch: [1] [1360/1624] lr: 0.000042 closs: 0.7943 (0.7664) grad_norm: 0.3924 (0.4514) time: 6.0552 data: 0.0002 max mem: 71823 -[03:28:03.742422] Epoch: [1] [1370/1624] lr: 0.000042 closs: 0.8004 (0.7667) grad_norm: 0.4126 (0.4513) time: 6.0492 data: 0.0002 max mem: 71823 -[03:29:04.417551] Epoch: [1] [1380/1624] lr: 0.000042 closs: 0.7422 (0.7666) grad_norm: 0.4126 (0.4513) time: 6.0539 data: 0.0002 max mem: 71823 -[03:30:04.778462] Epoch: [1] [1390/1624] lr: 0.000042 closs: 0.6808 (0.7668) grad_norm: 0.4126 (0.4517) time: 6.0517 data: 0.0002 max mem: 71823 -[03:31:05.294565] Epoch: [1] [1400/1624] lr: 0.000041 closs: 0.7667 (0.7668) grad_norm: 0.4083 (0.4512) time: 6.0437 data: 0.0002 max mem: 71823 -[03:32:05.808172] Epoch: [1] [1410/1624] lr: 0.000041 closs: 0.7643 (0.7669) grad_norm: 0.4165 (0.4512) time: 6.0513 data: 0.0002 max mem: 71823 -[03:33:06.303164] Epoch: [1] [1420/1624] lr: 0.000041 closs: 0.7624 (0.7668) grad_norm: 0.4153 (0.4510) time: 6.0503 data: 0.0002 max mem: 71823 -[03:34:07.018791] Epoch: [1] [1430/1624] lr: 0.000041 closs: 0.7385 (0.7667) grad_norm: 0.4082 (0.4507) time: 6.0604 data: 0.0002 max mem: 71823 -[03:35:07.591938] Epoch: [1] [1440/1624] lr: 0.000041 closs: 0.7821 (0.7670) grad_norm: 0.4153 (0.4503) time: 6.0643 data: 0.0002 max mem: 71823 -[03:36:08.175679] Epoch: [1] [1450/1624] lr: 0.000041 closs: 0.7821 (0.7668) grad_norm: 0.4082 (0.4501) time: 6.0577 data: 0.0002 max mem: 71823 -[03:37:08.743766] Epoch: [1] [1460/1624] lr: 0.000041 closs: 0.8247 (0.7675) grad_norm: 0.4088 (0.4500) time: 6.0575 data: 0.0002 max mem: 71823 -[03:38:09.066520] Epoch: [1] [1470/1624] lr: 0.000041 closs: 0.7716 (0.7675) grad_norm: 0.4088 (0.4497) time: 6.0444 data: 0.0002 max mem: 71823 -[03:39:09.618304] Epoch: [1] [1480/1624] lr: 0.000041 closs: 0.7580 (0.7674) grad_norm: 0.4104 (0.4497) time: 6.0436 data: 0.0002 max mem: 71823 -[03:40:10.102989] Epoch: [1] [1490/1624] lr: 0.000040 closs: 0.7817 (0.7675) grad_norm: 0.4104 (0.4494) time: 6.0517 data: 0.0002 max mem: 71823 -[03:41:10.605937] Epoch: [1] [1500/1624] lr: 0.000040 closs: 0.7682 (0.7675) grad_norm: 0.4078 (0.4493) time: 6.0493 data: 0.0002 max mem: 71823 -[03:42:11.155775] Epoch: [1] [1510/1624] lr: 0.000040 closs: 0.7507 (0.7670) grad_norm: 0.3886 (0.4489) time: 6.0525 data: 0.0002 max mem: 71823 -[03:43:11.769187] Epoch: [1] [1520/1624] lr: 0.000040 closs: 0.7562 (0.7677) grad_norm: 0.3886 (0.4487) time: 6.0580 data: 0.0002 max mem: 71823 -[03:44:12.245921] Epoch: [1] [1530/1624] lr: 0.000040 closs: 0.7825 (0.7675) grad_norm: 0.3888 (0.4484) time: 6.0544 data: 0.0002 max mem: 71823 -[03:45:12.854137] Epoch: [1] [1540/1624] lr: 0.000040 closs: 0.7353 (0.7675) grad_norm: 0.3888 (0.4484) time: 6.0541 data: 0.0002 max mem: 71823 -[03:46:13.261704] Epoch: [1] [1550/1624] lr: 0.000040 closs: 0.6820 (0.7669) grad_norm: 0.3924 (0.4489) time: 6.0507 data: 0.0002 max mem: 71823 -[03:47:13.667897] Epoch: [1] [1560/1624] lr: 0.000040 closs: 0.6973 (0.7673) grad_norm: 0.4154 (0.4490) time: 6.0406 data: 0.0002 max mem: 71823 -[03:48:14.102199] Epoch: [1] [1570/1624] lr: 0.000039 closs: 0.7706 (0.7668) grad_norm: 0.4154 (0.4487) time: 6.0419 data: 0.0002 max mem: 71823 -[03:49:14.433452] Epoch: [1] [1580/1624] lr: 0.000039 closs: 0.7864 (0.7671) grad_norm: 0.4216 (0.4489) time: 6.0381 data: 0.0002 max mem: 71823 -[03:50:15.126773] Epoch: [1] [1590/1624] lr: 0.000039 closs: 0.8283 (0.7679) grad_norm: 0.4218 (0.4486) time: 6.0511 data: 0.0002 max mem: 71823 -[03:51:15.537158] Epoch: [1] [1600/1624] lr: 0.000039 closs: 0.8075 (0.7681) grad_norm: 0.4218 (0.4485) time: 6.0551 data: 0.0002 max mem: 71823 -[03:52:15.961264] Epoch: [1] [1610/1624] lr: 0.000039 closs: 0.7415 (0.7679) grad_norm: 0.4343 (0.4486) time: 6.0416 data: 0.0002 max mem: 71823 -[03:53:16.476522] Epoch: [1] [1620/1624] lr: 0.000039 closs: 0.6575 (0.7669) grad_norm: 0.4115 (0.4482) time: 6.0468 data: 0.0002 max mem: 71823 -[03:53:35.176375] Epoch: [1] Total time: 2:43:48 -[03:53:35.348068] Averaged stats: lr: 0.000039 closs: 0.6905 (0.7643) grad_norm: 0.4187 (0.4482) -[03:53:36.360859] model saved -[03:53:40.966516] optimizer saved -[03:53:40.967427] other rank-common saved -[03:53:40.974609] rank-specific saved -[03:53:40.994887] log_dir: ./output_dir -[03:53:49.633633] Epoch: [2] [0/1624] lr: 0.000039 closs: 0.6567 (0.6567) time: 8.6382 data: 2.5888 max mem: 71823 -[03:54:50.120742] Epoch: [2] [10/1624] lr: 0.000039 closs: 0.7167 (0.7285) grad_norm: 0.3922 (0.4083) time: 6.2840 data: 0.2356 max mem: 71823 -[03:55:50.699064] Epoch: [2] [20/1624] lr: 0.000038 closs: 0.7289 (0.7452) grad_norm: 0.4004 (0.4235) time: 6.0531 data: 0.0002 max mem: 71823 -[03:56:51.155669] Epoch: [2] [30/1624] lr: 0.000038 closs: 0.7485 (0.7367) grad_norm: 0.4238 (0.4811) time: 6.0516 data: 0.0002 max mem: 71823 -[03:57:51.723436] Epoch: [2] [40/1624] lr: 0.000038 closs: 0.7494 (0.7456) grad_norm: 0.4075 (0.4638) time: 6.0511 data: 0.0002 max mem: 71823 -[03:58:52.242442] Epoch: [2] [50/1624] lr: 0.000038 closs: 0.7581 (0.7475) grad_norm: 0.4536 (0.4731) time: 6.0542 data: 0.0002 max mem: 71823 -[03:59:52.635317] Epoch: [2] [60/1624] lr: 0.000038 closs: 0.7812 (0.7563) grad_norm: 0.4536 (0.4648) time: 6.0455 data: 0.0002 max mem: 71823 -[04:00:53.170832] Epoch: [2] [70/1624] lr: 0.000038 closs: 0.7879 (0.7594) grad_norm: 0.4075 (0.4614) time: 6.0463 data: 0.0002 max mem: 71823 -[04:01:53.557352] Epoch: [2] [80/1624] lr: 0.000038 closs: 0.7595 (0.7518) grad_norm: 0.4194 (0.4543) time: 6.0460 data: 0.0002 max mem: 71823 -[04:02:54.000703] Epoch: [2] [90/1624] lr: 0.000038 closs: 0.6843 (0.7451) grad_norm: 0.4083 (0.4529) time: 6.0414 data: 0.0002 max mem: 71823 -[04:03:54.588987] Epoch: [2] [100/1624] lr: 0.000037 closs: 0.6843 (0.7432) grad_norm: 0.4085 (0.4476) time: 6.0514 data: 0.0002 max mem: 71823 -[04:04:54.957259] Epoch: [2] [110/1624] lr: 0.000037 closs: 0.7478 (0.7429) grad_norm: 0.4083 (0.4434) time: 6.0477 data: 0.0002 max mem: 71823 -[04:05:55.346791] Epoch: [2] [120/1624] lr: 0.000037 closs: 0.7573 (0.7465) grad_norm: 0.4026 (0.4396) time: 6.0378 data: 0.0002 max mem: 71823 -[04:06:55.882296] Epoch: [2] [130/1624] lr: 0.000037 closs: 0.7323 (0.7454) grad_norm: 0.4003 (0.4371) time: 6.0461 data: 0.0002 max mem: 71823 -[04:07:56.397038] Epoch: [2] [140/1624] lr: 0.000037 closs: 0.7323 (0.7473) grad_norm: 0.4003 (0.4392) time: 6.0524 data: 0.0002 max mem: 71823 -[04:08:56.864119] Epoch: [2] [150/1624] lr: 0.000037 closs: 0.7608 (0.7498) grad_norm: 0.4026 (0.4405) time: 6.0490 data: 0.0002 max mem: 71823 -[04:09:57.375902] Epoch: [2] [160/1624] lr: 0.000037 closs: 0.7676 (0.7511) grad_norm: 0.3952 (0.4350) time: 6.0488 data: 0.0002 max mem: 71823 -[04:10:57.829533] Epoch: [2] [170/1624] lr: 0.000037 closs: 0.7870 (0.7547) grad_norm: 0.3867 (0.4316) time: 6.0481 data: 0.0002 max mem: 71823 -[04:11:58.225575] Epoch: [2] [180/1624] lr: 0.000036 closs: 0.7395 (0.7554) grad_norm: 0.3813 (0.4342) time: 6.0423 data: 0.0002 max mem: 71823 -[04:12:58.740417] Epoch: [2] [190/1624] lr: 0.000036 closs: 0.7189 (0.7537) grad_norm: 0.3763 (0.4353) time: 6.0454 data: 0.0002 max mem: 71823 -[04:13:59.183835] Epoch: [2] [200/1624] lr: 0.000036 closs: 0.8074 (0.7588) grad_norm: 0.3819 (0.4328) time: 6.0478 data: 0.0002 max mem: 71823 -[04:14:59.719860] Epoch: [2] [210/1624] lr: 0.000036 closs: 0.7833 (0.7562) grad_norm: 0.3895 (0.4335) time: 6.0488 data: 0.0002 max mem: 71823 -[04:16:00.107303] Epoch: [2] [220/1624] lr: 0.000036 closs: 0.7087 (0.7527) grad_norm: 0.4166 (0.4331) time: 6.0460 data: 0.0002 max mem: 71823 -[04:17:00.584270] Epoch: [2] [230/1624] lr: 0.000036 closs: 0.7148 (0.7511) grad_norm: 0.3895 (0.4337) time: 6.0431 data: 0.0002 max mem: 71823 -[04:18:01.133127] Epoch: [2] [240/1624] lr: 0.000036 closs: 0.7619 (0.7516) grad_norm: 0.4153 (0.4345) time: 6.0512 data: 0.0002 max mem: 71823 -[04:19:01.550820] Epoch: [2] [250/1624] lr: 0.000035 closs: 0.7883 (0.7508) grad_norm: 0.3974 (0.4324) time: 6.0482 data: 0.0002 max mem: 71823 -[04:20:01.925857] Epoch: [2] [260/1624] lr: 0.000035 closs: 0.7150 (0.7478) grad_norm: 0.3932 (0.4314) time: 6.0395 data: 0.0002 max mem: 71823 -[04:21:02.285060] Epoch: [2] [270/1624] lr: 0.000035 closs: 0.7476 (0.7509) grad_norm: 0.4124 (0.4314) time: 6.0366 data: 0.0002 max mem: 71823 -[04:22:02.785511] Epoch: [2] [280/1624] lr: 0.000035 closs: 0.7325 (0.7494) grad_norm: 0.4124 (0.4314) time: 6.0428 data: 0.0002 max mem: 71823 -[04:23:03.355525] Epoch: [2] [290/1624] lr: 0.000035 closs: 0.7280 (0.7493) grad_norm: 0.4124 (0.4305) time: 6.0534 data: 0.0002 max mem: 71823 -[04:24:03.809885] Epoch: [2] [300/1624] lr: 0.000035 closs: 0.7409 (0.7499) grad_norm: 0.4129 (0.4305) time: 6.0511 data: 0.0002 max mem: 71823 -[04:25:04.396448] Epoch: [2] [310/1624] lr: 0.000035 closs: 0.7221 (0.7480) grad_norm: 0.4038 (0.4293) time: 6.0519 data: 0.0002 max mem: 71823 -[04:26:04.953008] Epoch: [2] [320/1624] lr: 0.000035 closs: 0.7195 (0.7487) grad_norm: 0.4038 (0.4302) time: 6.0570 data: 0.0002 max mem: 71823 -[04:27:05.356605] Epoch: [2] [330/1624] lr: 0.000034 closs: 0.7437 (0.7483) grad_norm: 0.4129 (0.4304) time: 6.0479 data: 0.0002 max mem: 71823 -[04:28:05.862179] Epoch: [2] [340/1624] lr: 0.000034 closs: 0.7437 (0.7490) grad_norm: 0.4090 (0.4306) time: 6.0453 data: 0.0002 max mem: 71823 -[04:29:06.280267] Epoch: [2] [350/1624] lr: 0.000034 closs: 0.7761 (0.7523) grad_norm: 0.4090 (0.4296) time: 6.0461 data: 0.0002 max mem: 71823 -[04:30:06.813518] Epoch: [2] [360/1624] lr: 0.000034 closs: 0.7418 (0.7511) grad_norm: 0.4021 (0.4285) time: 6.0474 data: 0.0002 max mem: 71823 -[04:31:07.262765] Epoch: [2] [370/1624] lr: 0.000034 closs: 0.7193 (0.7516) grad_norm: 0.3934 (0.4274) time: 6.0490 data: 0.0002 max mem: 71823 -[04:32:07.790019] Epoch: [2] [380/1624] lr: 0.000034 closs: 0.7619 (0.7516) grad_norm: 0.4016 (0.4268) time: 6.0487 data: 0.0002 max mem: 71823 -[04:33:08.319109] Epoch: [2] [390/1624] lr: 0.000034 closs: 0.7667 (0.7534) grad_norm: 0.4058 (0.4272) time: 6.0527 data: 0.0002 max mem: 71823 -[04:34:08.964259] Epoch: [2] [400/1624] lr: 0.000033 closs: 0.7667 (0.7534) grad_norm: 0.4205 (0.4282) time: 6.0586 data: 0.0002 max mem: 71823 -[04:35:09.389267] Epoch: [2] [410/1624] lr: 0.000033 closs: 0.7965 (0.7536) grad_norm: 0.4291 (0.4280) time: 6.0534 data: 0.0002 max mem: 71823 -[04:36:09.999135] Epoch: [2] [420/1624] lr: 0.000033 closs: 0.7686 (0.7531) grad_norm: 0.4415 (0.4300) time: 6.0516 data: 0.0002 max mem: 71823 -[04:37:10.509272] Epoch: [2] [430/1624] lr: 0.000033 closs: 0.7361 (0.7536) grad_norm: 0.4161 (0.4299) time: 6.0559 data: 0.0002 max mem: 71823 -[04:38:11.132613] Epoch: [2] [440/1624] lr: 0.000033 closs: 0.7380 (0.7536) grad_norm: 0.4138 (0.4294) time: 6.0565 data: 0.0002 max mem: 71823 -[04:39:11.583019] Epoch: [2] [450/1624] lr: 0.000033 closs: 0.7156 (0.7538) grad_norm: 0.4138 (0.4296) time: 6.0536 data: 0.0002 max mem: 71823 -[04:40:12.105852] Epoch: [2] [460/1624] lr: 0.000033 closs: 0.7412 (0.7547) grad_norm: 0.4342 (0.4298) time: 6.0485 data: 0.0002 max mem: 71823 -[04:41:12.643122] Epoch: [2] [470/1624] lr: 0.000032 closs: 0.7791 (0.7552) grad_norm: 0.4342 (0.4294) time: 6.0529 data: 0.0002 max mem: 71823 -[04:42:13.219308] Epoch: [2] [480/1624] lr: 0.000032 closs: 0.7791 (0.7568) grad_norm: 0.3875 (0.4283) time: 6.0556 data: 0.0002 max mem: 71823 -[04:43:13.667590] Epoch: [2] [490/1624] lr: 0.000032 closs: 0.8093 (0.7555) grad_norm: 0.3875 (0.4280) time: 6.0511 data: 0.0002 max mem: 71823 -[04:44:14.174983] Epoch: [2] [500/1624] lr: 0.000032 closs: 0.7695 (0.7552) grad_norm: 0.3875 (0.4276) time: 6.0477 data: 0.0002 max mem: 71823 -[04:45:14.665628] Epoch: [2] [510/1624] lr: 0.000032 closs: 0.7695 (0.7549) grad_norm: 0.4001 (0.4276) time: 6.0498 data: 0.0002 max mem: 71823 -[04:46:15.119059] Epoch: [2] [520/1624] lr: 0.000032 closs: 0.6909 (0.7542) grad_norm: 0.4062 (0.4283) time: 6.0471 data: 0.0002 max mem: 71823 -[04:47:15.782263] Epoch: [2] [530/1624] lr: 0.000032 closs: 0.7358 (0.7544) grad_norm: 0.4001 (0.4286) time: 6.0557 data: 0.0002 max mem: 71823 -[04:48:16.211033] Epoch: [2] [540/1624] lr: 0.000031 closs: 0.8122 (0.7553) grad_norm: 0.3886 (0.4280) time: 6.0545 data: 0.0002 max mem: 71823 -[04:49:16.737140] Epoch: [2] [550/1624] lr: 0.000031 closs: 0.8014 (0.7562) grad_norm: 0.3791 (0.4272) time: 6.0476 data: 0.0002 max mem: 71823 -[04:50:17.355261] Epoch: [2] [560/1624] lr: 0.000031 closs: 0.8014 (0.7572) grad_norm: 0.3816 (0.4265) time: 6.0571 data: 0.0002 max mem: 71823 -[04:51:17.740812] Epoch: [2] [570/1624] lr: 0.000031 closs: 0.7336 (0.7563) grad_norm: 0.3844 (0.4266) time: 6.0501 data: 0.0002 max mem: 71823 -[04:52:18.233410] Epoch: [2] [580/1624] lr: 0.000031 closs: 0.7057 (0.7574) grad_norm: 0.3845 (0.4270) time: 6.0438 data: 0.0002 max mem: 71823 -[04:53:18.656774] Epoch: [2] [590/1624] lr: 0.000031 closs: 0.7585 (0.7570) grad_norm: 0.3845 (0.4263) time: 6.0457 data: 0.0002 max mem: 71823 -[04:54:19.139372] Epoch: [2] [600/1624] lr: 0.000031 closs: 0.7585 (0.7577) grad_norm: 0.3931 (0.4259) time: 6.0452 data: 0.0002 max mem: 71823 -[04:55:19.783410] Epoch: [2] [610/1624] lr: 0.000030 closs: 0.7261 (0.7573) grad_norm: 0.3931 (0.4263) time: 6.0562 data: 0.0002 max mem: 71823 -[04:56:20.266272] Epoch: [2] [620/1624] lr: 0.000030 closs: 0.6972 (0.7572) grad_norm: 0.3960 (0.4271) time: 6.0562 data: 0.0002 max mem: 71823 -[04:57:20.812176] Epoch: [2] [630/1624] lr: 0.000030 closs: 0.8105 (0.7582) grad_norm: 0.4058 (0.4266) time: 6.0513 data: 0.0002 max mem: 71823 -[04:58:21.333035] Epoch: [2] [640/1624] lr: 0.000030 closs: 0.8105 (0.7583) grad_norm: 0.4285 (0.4263) time: 6.0532 data: 0.0002 max mem: 71823 -[04:59:21.688713] Epoch: [2] [650/1624] lr: 0.000030 closs: 0.7705 (0.7583) grad_norm: 0.4058 (0.4264) time: 6.0437 data: 0.0002 max mem: 71823 -[05:00:22.146001] Epoch: [2] [660/1624] lr: 0.000030 closs: 0.7681 (0.7593) grad_norm: 0.4096 (0.4270) time: 6.0405 data: 0.0002 max mem: 71823 -[05:01:22.568724] Epoch: [2] [670/1624] lr: 0.000030 closs: 0.7753 (0.7600) grad_norm: 0.4096 (0.4288) time: 6.0439 data: 0.0002 max mem: 71823 -[05:02:22.996899] Epoch: [2] [680/1624] lr: 0.000029 closs: 0.7412 (0.7596) grad_norm: 0.4112 (0.4289) time: 6.0424 data: 0.0002 max mem: 71823 -[05:03:23.483577] Epoch: [2] [690/1624] lr: 0.000029 closs: 0.7272 (0.7599) grad_norm: 0.4164 (0.4295) time: 6.0456 data: 0.0002 max mem: 71823 -[05:04:23.983879] Epoch: [2] [700/1624] lr: 0.000029 closs: 0.7488 (0.7606) grad_norm: 0.4023 (0.4292) time: 6.0492 data: 0.0002 max mem: 71823 -[05:05:24.524557] Epoch: [2] [710/1624] lr: 0.000029 closs: 0.7465 (0.7602) grad_norm: 0.4023 (0.4290) time: 6.0519 data: 0.0002 max mem: 71823 -[05:06:25.081888] Epoch: [2] [720/1624] lr: 0.000029 closs: 0.7425 (0.7601) grad_norm: 0.3933 (0.4285) time: 6.0547 data: 0.0002 max mem: 71823 -[05:07:25.575235] Epoch: [2] [730/1624] lr: 0.000029 closs: 0.7836 (0.7610) grad_norm: 0.3856 (0.4278) time: 6.0524 data: 0.0002 max mem: 71823 -[05:08:26.014281] Epoch: [2] [740/1624] lr: 0.000029 closs: 0.7769 (0.7605) grad_norm: 0.3856 (0.4283) time: 6.0465 data: 0.0002 max mem: 71823 -[05:09:26.332478] Epoch: [2] [750/1624] lr: 0.000028 closs: 0.7147 (0.7602) grad_norm: 0.3841 (0.4277) time: 6.0377 data: 0.0002 max mem: 71823 -[05:10:26.848418] Epoch: [2] [760/1624] lr: 0.000028 closs: 0.7451 (0.7607) grad_norm: 0.4086 (0.4293) time: 6.0416 data: 0.0002 max mem: 71823 -[05:11:27.534295] Epoch: [2] [770/1624] lr: 0.000028 closs: 0.8140 (0.7607) grad_norm: 0.4343 (0.4296) time: 6.0600 data: 0.0002 max mem: 71823 -[05:12:27.885198] Epoch: [2] [780/1624] lr: 0.000028 closs: 0.7992 (0.7611) grad_norm: 0.4281 (0.4296) time: 6.0517 data: 0.0002 max mem: 71823 -[05:13:28.407985] Epoch: [2] [790/1624] lr: 0.000028 closs: 0.6886 (0.7603) grad_norm: 0.4281 (0.4291) time: 6.0436 data: 0.0003 max mem: 71823 -[05:14:29.052515] Epoch: [2] [800/1624] lr: 0.000028 closs: 0.6886 (0.7605) grad_norm: 0.4096 (0.4294) time: 6.0582 data: 0.0003 max mem: 71823 -[05:15:29.458618] Epoch: [2] [810/1624] lr: 0.000028 closs: 0.7142 (0.7603) grad_norm: 0.3949 (0.4290) time: 6.0524 data: 0.0002 max mem: 71823 -[05:16:29.891077] Epoch: [2] [820/1624] lr: 0.000027 closs: 0.6915 (0.7597) grad_norm: 0.3950 (0.4290) time: 6.0418 data: 0.0002 max mem: 71823 -[05:17:30.311904] Epoch: [2] [830/1624] lr: 0.000027 closs: 0.7539 (0.7605) grad_norm: 0.4149 (0.4293) time: 6.0425 data: 0.0002 max mem: 71823 -[05:18:30.844955] Epoch: [2] [840/1624] lr: 0.000027 closs: 0.7703 (0.7605) grad_norm: 0.4254 (0.4313) time: 6.0476 data: 0.0002 max mem: 71823 -[05:19:31.336036] Epoch: [2] [850/1624] lr: 0.000027 closs: 0.7751 (0.7608) grad_norm: 0.4265 (0.4310) time: 6.0511 data: 0.0002 max mem: 71823 -[05:20:31.833606] Epoch: [2] [860/1624] lr: 0.000027 closs: 0.7392 (0.7602) grad_norm: 0.4439 (0.4341) time: 6.0493 data: 0.0002 max mem: 71823 -[05:21:32.413583] Epoch: [2] [870/1624] lr: 0.000027 closs: 0.7324 (0.7606) grad_norm: 0.4394 (0.4339) time: 6.0537 data: 0.0002 max mem: 71823 -[05:22:32.963814] Epoch: [2] [880/1624] lr: 0.000027 closs: 0.7981 (0.7607) grad_norm: 0.4394 (0.4356) time: 6.0564 data: 0.0002 max mem: 71823 -[05:23:33.511646] Epoch: [2] [890/1624] lr: 0.000026 closs: 0.7318 (0.7603) grad_norm: 0.4674 (0.4361) time: 6.0548 data: 0.0002 max mem: 71823 -[05:24:33.951439] Epoch: [2] [900/1624] lr: 0.000026 closs: 0.8069 (0.7612) grad_norm: 0.4595 (0.4360) time: 6.0492 data: 0.0002 max mem: 71823 -[05:25:34.387645] Epoch: [2] [910/1624] lr: 0.000026 closs: 0.8092 (0.7608) grad_norm: 0.4488 (0.4364) time: 6.0437 data: 0.0002 max mem: 71823 -[05:26:34.872735] Epoch: [2] [920/1624] lr: 0.000026 closs: 0.7813 (0.7621) grad_norm: 0.4488 (0.4374) time: 6.0459 data: 0.0002 max mem: 71823 -[05:27:35.441928] Epoch: [2] [930/1624] lr: 0.000026 closs: 0.7813 (0.7616) grad_norm: 0.4294 (0.4372) time: 6.0526 data: 0.0002 max mem: 71823 -[05:28:35.937808] Epoch: [2] [940/1624] lr: 0.000026 closs: 0.7029 (0.7612) grad_norm: 0.4266 (0.4367) time: 6.0531 data: 0.0002 max mem: 71823 -[05:29:36.440045] Epoch: [2] [950/1624] lr: 0.000026 closs: 0.7839 (0.7620) grad_norm: 0.4085 (0.4366) time: 6.0498 data: 0.0002 max mem: 71823 -[05:30:37.041242] Epoch: [2] [960/1624] lr: 0.000025 closs: 0.8103 (0.7623) grad_norm: 0.3868 (0.4362) time: 6.0550 data: 0.0002 max mem: 71823 -[05:31:37.541131] Epoch: [2] [970/1624] lr: 0.000025 closs: 0.7465 (0.7619) grad_norm: 0.3868 (0.4359) time: 6.0549 data: 0.0002 max mem: 71823 -[05:32:37.932059] Epoch: [2] [980/1624] lr: 0.000025 closs: 0.8207 (0.7628) grad_norm: 0.3794 (0.4355) time: 6.0444 data: 0.0002 max mem: 71823 -[05:33:38.436414] Epoch: [2] [990/1624] lr: 0.000025 closs: 0.8473 (0.7637) grad_norm: 0.3986 (0.4354) time: 6.0446 data: 0.0002 max mem: 71823 -[05:34:38.819228] Epoch: [2] [1000/1624] lr: 0.000025 closs: 0.8094 (0.7632) grad_norm: 0.4056 (0.4350) time: 6.0442 data: 0.0002 max mem: 71823 -[05:35:39.370562] Epoch: [2] [1010/1624] lr: 0.000025 closs: 0.7168 (0.7626) grad_norm: 0.4056 (0.4349) time: 6.0465 data: 0.0002 max mem: 71823 -[05:36:39.864540] Epoch: [2] [1020/1624] lr: 0.000024 closs: 0.7539 (0.7631) grad_norm: 0.4056 (0.4345) time: 6.0521 data: 0.0002 max mem: 71823 -[05:37:40.518970] Epoch: [2] [1030/1624] lr: 0.000024 closs: 0.7568 (0.7632) grad_norm: 0.4002 (0.4343) time: 6.0573 data: 0.0002 max mem: 71823 -[05:38:41.137094] Epoch: [2] [1040/1624] lr: 0.000024 closs: 0.7308 (0.7626) grad_norm: 0.3876 (0.4338) time: 6.0635 data: 0.0002 max mem: 71823 -[05:39:41.568197] Epoch: [2] [1050/1624] lr: 0.000024 closs: 0.7144 (0.7625) grad_norm: 0.3846 (0.4335) time: 6.0523 data: 0.0002 max mem: 71823 -[05:40:41.983490] Epoch: [2] [1060/1624] lr: 0.000024 closs: 0.7114 (0.7617) grad_norm: 0.4017 (0.4336) time: 6.0422 data: 0.0002 max mem: 71823 -[05:41:42.391496] Epoch: [2] [1070/1624] lr: 0.000024 closs: 0.7114 (0.7613) grad_norm: 0.3866 (0.4336) time: 6.0410 data: 0.0002 max mem: 71823 -[05:42:42.950623] Epoch: [2] [1080/1624] lr: 0.000024 closs: 0.6962 (0.7607) grad_norm: 0.4061 (0.4334) time: 6.0482 data: 0.0002 max mem: 71823 -[05:43:43.537822] Epoch: [2] [1090/1624] lr: 0.000023 closs: 0.6870 (0.7600) grad_norm: 0.3972 (0.4329) time: 6.0572 data: 0.0002 max mem: 71823 -[05:44:43.993653] Epoch: [2] [1100/1624] lr: 0.000023 closs: 0.6894 (0.7599) grad_norm: 0.3961 (0.4327) time: 6.0520 data: 0.0002 max mem: 71823 -[05:45:44.647224] Epoch: [2] [1110/1624] lr: 0.000023 closs: 0.6894 (0.7595) grad_norm: 0.3961 (0.4327) time: 6.0553 data: 0.0002 max mem: 71823 -[05:46:45.298068] Epoch: [2] [1120/1624] lr: 0.000023 closs: 0.7102 (0.7586) grad_norm: 0.4022 (0.4331) time: 6.0651 data: 0.0002 max mem: 71823 -[05:47:45.751589] Epoch: [2] [1130/1624] lr: 0.000023 closs: 0.7231 (0.7583) grad_norm: 0.4051 (0.4329) time: 6.0551 data: 0.0002 max mem: 71823 -[05:48:46.230120] Epoch: [2] [1140/1624] lr: 0.000023 closs: 0.7175 (0.7579) grad_norm: 0.4130 (0.4328) time: 6.0465 data: 0.0002 max mem: 71823 -[05:49:46.742959] Epoch: [2] [1150/1624] lr: 0.000023 closs: 0.7864 (0.7583) grad_norm: 0.4149 (0.4335) time: 6.0494 data: 0.0002 max mem: 71823 -[05:50:47.051301] Epoch: [2] [1160/1624] lr: 0.000022 closs: 0.8174 (0.7584) grad_norm: 0.4051 (0.4331) time: 6.0409 data: 0.0002 max mem: 71823 -[05:51:47.602673] Epoch: [2] [1170/1624] lr: 0.000022 closs: 0.8162 (0.7586) grad_norm: 0.4193 (0.4331) time: 6.0429 data: 0.0002 max mem: 71823 -[05:52:48.052808] Epoch: [2] [1180/1624] lr: 0.000022 closs: 0.7332 (0.7583) grad_norm: 0.4258 (0.4336) time: 6.0499 data: 0.0002 max mem: 71823 -[05:53:48.598973] Epoch: [2] [1190/1624] lr: 0.000022 closs: 0.7115 (0.7580) grad_norm: 0.4050 (0.4335) time: 6.0497 data: 0.0002 max mem: 71823 -[05:54:49.278060] Epoch: [2] [1200/1624] lr: 0.000022 closs: 0.7276 (0.7577) grad_norm: 0.4050 (0.4332) time: 6.0611 data: 0.0002 max mem: 71823 -[05:55:49.707040] Epoch: [2] [1210/1624] lr: 0.000022 closs: 0.7425 (0.7573) grad_norm: 0.4031 (0.4328) time: 6.0553 data: 0.0002 max mem: 71823 -[05:56:50.177577] Epoch: [2] [1220/1624] lr: 0.000022 closs: 0.7539 (0.7572) grad_norm: 0.4031 (0.4333) time: 6.0448 data: 0.0002 max mem: 71823 -[05:57:50.648654] Epoch: [2] [1230/1624] lr: 0.000022 closs: 0.7539 (0.7571) grad_norm: 0.4057 (0.4334) time: 6.0470 data: 0.0002 max mem: 71823 -[05:58:51.208104] Epoch: [2] [1240/1624] lr: 0.000021 closs: 0.7109 (0.7569) grad_norm: 0.4057 (0.4331) time: 6.0514 data: 0.0002 max mem: 71823 -[05:59:51.727205] Epoch: [2] [1250/1624] lr: 0.000021 closs: 0.7003 (0.7565) grad_norm: 0.4221 (0.4332) time: 6.0538 data: 0.0002 max mem: 71823 -[06:00:52.188567] Epoch: [2] [1260/1624] lr: 0.000021 closs: 0.7847 (0.7570) grad_norm: 0.4102 (0.4330) time: 6.0489 data: 0.0002 max mem: 71823 -[06:01:52.739651] Epoch: [2] [1270/1624] lr: 0.000021 closs: 0.7646 (0.7569) grad_norm: 0.3921 (0.4329) time: 6.0505 data: 0.0002 max mem: 71823 -[06:02:53.295334] Epoch: [2] [1280/1624] lr: 0.000021 closs: 0.7208 (0.7566) grad_norm: 0.3921 (0.4327) time: 6.0552 data: 0.0002 max mem: 71823 -[06:03:53.661534] Epoch: [2] [1290/1624] lr: 0.000021 closs: 0.7191 (0.7562) grad_norm: 0.3865 (0.4326) time: 6.0460 data: 0.0002 max mem: 71823 -[06:04:54.021870] Epoch: [2] [1300/1624] lr: 0.000021 closs: 0.6820 (0.7555) grad_norm: 0.4109 (0.4331) time: 6.0362 data: 0.0002 max mem: 71823 -[06:05:54.674480] Epoch: [2] [1310/1624] lr: 0.000020 closs: 0.6813 (0.7549) grad_norm: 0.4214 (0.4332) time: 6.0505 data: 0.0002 max mem: 71823 -[06:06:55.189564] Epoch: [2] [1320/1624] lr: 0.000020 closs: 0.7807 (0.7551) grad_norm: 0.4214 (0.4332) time: 6.0582 data: 0.0002 max mem: 71823 -[06:07:55.648512] Epoch: [2] [1330/1624] lr: 0.000020 closs: 0.7807 (0.7550) grad_norm: 0.4045 (0.4332) time: 6.0485 data: 0.0002 max mem: 71823 -[06:08:56.124112] Epoch: [2] [1340/1624] lr: 0.000020 closs: 0.7035 (0.7552) grad_norm: 0.4045 (0.4331) time: 6.0466 data: 0.0002 max mem: 71823 -[06:09:56.645297] Epoch: [2] [1350/1624] lr: 0.000020 closs: 0.7091 (0.7549) grad_norm: 0.4059 (0.4333) time: 6.0497 data: 0.0002 max mem: 71823 -[06:10:57.158394] Epoch: [2] [1360/1624] lr: 0.000020 closs: 0.7633 (0.7554) grad_norm: 0.4022 (0.4329) time: 6.0516 data: 0.0002 max mem: 71823 -[06:11:57.681184] Epoch: [2] [1370/1624] lr: 0.000020 closs: 0.7403 (0.7544) grad_norm: 0.4059 (0.4327) time: 6.0517 data: 0.0002 max mem: 71823 -[06:12:58.269421] Epoch: [2] [1380/1624] lr: 0.000019 closs: 0.6815 (0.7544) grad_norm: 0.4068 (0.4325) time: 6.0554 data: 0.0002 max mem: 71823 -[06:13:58.582291] Epoch: [2] [1390/1624] lr: 0.000019 closs: 0.7529 (0.7540) grad_norm: 0.4075 (0.4335) time: 6.0449 data: 0.0002 max mem: 71823 -[06:14:58.975408] Epoch: [2] [1400/1624] lr: 0.000019 closs: 0.7529 (0.7536) grad_norm: 0.4246 (0.4335) time: 6.0352 data: 0.0002 max mem: 71823 -[06:15:59.411158] Epoch: [2] [1410/1624] lr: 0.000019 closs: 0.6829 (0.7527) grad_norm: 0.4184 (0.4333) time: 6.0413 data: 0.0002 max mem: 71823 -[06:16:59.918068] Epoch: [2] [1420/1624] lr: 0.000019 closs: 0.6633 (0.7518) grad_norm: 0.4118 (0.4331) time: 6.0470 data: 0.0002 max mem: 71823 -[06:18:00.423575] Epoch: [2] [1430/1624] lr: 0.000019 closs: 0.7077 (0.7517) grad_norm: 0.4073 (0.4330) time: 6.0505 data: 0.0002 max mem: 71823 -[06:19:00.987143] Epoch: [2] [1440/1624] lr: 0.000019 closs: 0.7660 (0.7522) grad_norm: 0.4073 (0.4352) time: 6.0533 data: 0.0002 max mem: 71823 -[06:20:01.391388] Epoch: [2] [1450/1624] lr: 0.000019 closs: 0.7862 (0.7523) grad_norm: 0.4011 (0.4348) time: 6.0483 data: 0.0002 max mem: 71823 -[06:21:01.845263] Epoch: [2] [1460/1624] lr: 0.000018 closs: 0.7727 (0.7530) grad_norm: 0.4109 (0.4351) time: 6.0428 data: 0.0002 max mem: 71823 -[06:22:02.211455] Epoch: [2] [1470/1624] lr: 0.000018 closs: 0.8205 (0.7538) grad_norm: 0.4136 (0.4356) time: 6.0408 data: 0.0002 max mem: 71823 -[06:23:02.771182] Epoch: [2] [1480/1624] lr: 0.000018 closs: 0.7405 (0.7529) grad_norm: 0.4174 (0.4354) time: 6.0461 data: 0.0002 max mem: 71823 -[06:24:03.312530] Epoch: [2] [1490/1624] lr: 0.000018 closs: 0.7405 (0.7532) grad_norm: 0.4403 (0.4358) time: 6.0549 data: 0.0002 max mem: 71823 -[06:25:03.937979] Epoch: [2] [1500/1624] lr: 0.000018 closs: 0.7526 (0.7532) grad_norm: 0.4225 (0.4354) time: 6.0582 data: 0.0002 max mem: 71823 -[06:26:04.454022] Epoch: [2] [1510/1624] lr: 0.000018 closs: 0.7919 (0.7537) grad_norm: 0.4019 (0.4353) time: 6.0569 data: 0.0002 max mem: 71823 -[06:27:05.036824] Epoch: [2] [1520/1624] lr: 0.000018 closs: 0.7819 (0.7535) grad_norm: 0.3997 (0.4355) time: 6.0548 data: 0.0002 max mem: 71823 -[06:28:05.541565] Epoch: [2] [1530/1624] lr: 0.000017 closs: 0.7249 (0.7536) grad_norm: 0.3997 (0.4353) time: 6.0542 data: 0.0002 max mem: 71823 -[06:29:05.989256] Epoch: [2] [1540/1624] lr: 0.000017 closs: 0.7684 (0.7539) grad_norm: 0.4038 (0.4354) time: 6.0475 data: 0.0002 max mem: 71823 -[06:30:06.551140] Epoch: [2] [1550/1624] lr: 0.000017 closs: 0.7387 (0.7534) grad_norm: 0.4339 (0.4362) time: 6.0503 data: 0.0002 max mem: 71823 -[06:31:07.054072] Epoch: [2] [1560/1624] lr: 0.000017 closs: 0.7051 (0.7537) grad_norm: 0.4062 (0.4359) time: 6.0531 data: 0.0002 max mem: 71823 -[06:32:07.659174] Epoch: [2] [1570/1624] lr: 0.000017 closs: 0.7603 (0.7538) grad_norm: 0.3874 (0.4355) time: 6.0553 data: 0.0002 max mem: 71823 -[06:33:08.088673] Epoch: [2] [1580/1624] lr: 0.000017 closs: 0.7580 (0.7537) grad_norm: 0.3773 (0.4352) time: 6.0516 data: 0.0002 max mem: 71823 -[06:34:08.598365] Epoch: [2] [1590/1624] lr: 0.000017 closs: 0.7667 (0.7544) grad_norm: 0.3773 (0.4354) time: 6.0468 data: 0.0003 max mem: 71823 -[06:35:09.174978] Epoch: [2] [1600/1624] lr: 0.000017 closs: 0.7395 (0.7540) grad_norm: 0.3773 (0.4351) time: 6.0542 data: 0.0003 max mem: 71823 -[06:36:09.668581] Epoch: [2] [1610/1624] lr: 0.000016 closs: 0.6854 (0.7540) grad_norm: 0.3773 (0.4349) time: 6.0534 data: 0.0002 max mem: 71823 -[06:37:10.047331] Epoch: [2] [1620/1624] lr: 0.000016 closs: 0.7070 (0.7537) grad_norm: 0.3849 (0.4348) time: 6.0435 data: 0.0002 max mem: 71823 -[06:37:28.654006] Epoch: [2] Total time: 2:43:47 -[06:37:28.810382] Averaged stats: lr: 0.000016 closs: 0.6684 (0.7501) grad_norm: 0.3771 (0.4348) -[06:37:29.813635] model saved -[06:37:34.331499] optimizer saved -[06:37:34.332562] other rank-common saved -[06:37:34.339713] rank-specific saved -[06:37:34.360052] log_dir: ./output_dir -[06:37:42.946111] Epoch: [3] [0/1624] lr: 0.000016 closs: 0.5983 (0.5983) time: 8.5855 data: 2.6018 max mem: 71823 -[06:38:43.310207] Epoch: [3] [10/1624] lr: 0.000016 closs: 0.7275 (0.7730) grad_norm: 0.4378 (0.4154) time: 6.2680 data: 0.2367 max mem: 71823 -[06:39:43.755801] Epoch: [3] [20/1624] lr: 0.000016 closs: 0.7925 (0.8144) grad_norm: 0.3806 (0.3987) time: 6.0403 data: 0.0002 max mem: 71823 -[06:40:44.205969] Epoch: [3] [30/1624] lr: 0.000016 closs: 0.7588 (0.7715) grad_norm: 0.3915 (0.4020) time: 6.0446 data: 0.0002 max mem: 71823 -[06:41:44.808674] Epoch: [3] [40/1624] lr: 0.000016 closs: 0.7015 (0.7587) grad_norm: 0.3915 (0.4019) time: 6.0525 data: 0.0002 max mem: 71823 -[06:42:45.317774] Epoch: [3] [50/1624] lr: 0.000016 closs: 0.7474 (0.7657) grad_norm: 0.3915 (0.4075) time: 6.0554 data: 0.0002 max mem: 71823 -[06:43:45.944565] Epoch: [3] [60/1624] lr: 0.000016 closs: 0.7721 (0.7542) grad_norm: 0.4101 (0.4094) time: 6.0566 data: 0.0002 max mem: 71823 -[06:44:46.388224] Epoch: [3] [70/1624] lr: 0.000015 closs: 0.7732 (0.7606) grad_norm: 0.4075 (0.4101) time: 6.0534 data: 0.0002 max mem: 71823 -[06:45:46.817158] Epoch: [3] [80/1624] lr: 0.000015 closs: 0.7760 (0.7536) grad_norm: 0.4322 (0.4202) time: 6.0435 data: 0.0002 max mem: 71823 -[06:46:47.406426] Epoch: [3] [90/1624] lr: 0.000015 closs: 0.6982 (0.7466) grad_norm: 0.4134 (0.4174) time: 6.0508 data: 0.0002 max mem: 71823 -[06:47:47.764402] Epoch: [3] [100/1624] lr: 0.000015 closs: 0.7089 (0.7496) grad_norm: 0.4075 (0.4168) time: 6.0472 data: 0.0002 max mem: 71823 -[06:48:48.221348] Epoch: [3] [110/1624] lr: 0.000015 closs: 0.7107 (0.7463) grad_norm: 0.3935 (0.4143) time: 6.0406 data: 0.0002 max mem: 71823 -[06:49:48.726168] Epoch: [3] [120/1624] lr: 0.000015 closs: 0.6929 (0.7486) grad_norm: 0.3764 (0.4121) time: 6.0480 data: 0.0002 max mem: 71823 -[06:50:49.116959] Epoch: [3] [130/1624] lr: 0.000015 closs: 0.7338 (0.7475) grad_norm: 0.3904 (0.4448) time: 6.0447 data: 0.0002 max mem: 71823 -[06:51:49.583295] Epoch: [3] [140/1624] lr: 0.000015 closs: 0.7059 (0.7431) grad_norm: 0.3934 (0.4539) time: 6.0427 data: 0.0002 max mem: 71823 -[06:52:50.176324] Epoch: [3] [150/1624] lr: 0.000014 closs: 0.7392 (0.7425) grad_norm: 0.3920 (0.4526) time: 6.0528 data: 0.0002 max mem: 71823 -[06:53:50.550479] Epoch: [3] [160/1624] lr: 0.000014 closs: 0.7257 (0.7397) grad_norm: 0.4105 (0.4517) time: 6.0482 data: 0.0002 max mem: 71823 -[06:54:51.267043] Epoch: [3] [170/1624] lr: 0.000014 closs: 0.6759 (0.7369) grad_norm: 0.4024 (0.4496) time: 6.0544 data: 0.0002 max mem: 71823 -[06:55:51.755392] Epoch: [3] [180/1624] lr: 0.000014 closs: 0.7266 (0.7396) grad_norm: 0.4004 (0.4466) time: 6.0601 data: 0.0002 max mem: 71823 -[06:56:52.231148] Epoch: [3] [190/1624] lr: 0.000014 closs: 0.7507 (0.7386) grad_norm: 0.4044 (0.4486) time: 6.0481 data: 0.0002 max mem: 71823 -[06:57:52.779241] Epoch: [3] [200/1624] lr: 0.000014 closs: 0.7266 (0.7396) grad_norm: 0.4044 (0.4479) time: 6.0511 data: 0.0002 max mem: 71823 -[06:58:53.169887] Epoch: [3] [210/1624] lr: 0.000014 closs: 0.7444 (0.7381) grad_norm: 0.4046 (0.4462) time: 6.0468 data: 0.0002 max mem: 71823 -[06:59:53.740198] Epoch: [3] [220/1624] lr: 0.000014 closs: 0.7350 (0.7386) grad_norm: 0.4186 (0.4465) time: 6.0479 data: 0.0002 max mem: 71823 -[07:00:54.228908] Epoch: [3] [230/1624] lr: 0.000013 closs: 0.7393 (0.7407) grad_norm: 0.4203 (0.4468) time: 6.0528 data: 0.0002 max mem: 71823 -[07:01:54.715324] Epoch: [3] [240/1624] lr: 0.000013 closs: 0.7607 (0.7462) grad_norm: 0.4216 (0.4460) time: 6.0486 data: 0.0002 max mem: 71823 -[07:02:55.175495] Epoch: [3] [250/1624] lr: 0.000013 closs: 0.7558 (0.7467) grad_norm: 0.4216 (0.4440) time: 6.0472 data: 0.0002 max mem: 71823 -[07:03:55.600509] Epoch: [3] [260/1624] lr: 0.000013 closs: 0.6772 (0.7448) grad_norm: 0.4216 (0.4444) time: 6.0441 data: 0.0002 max mem: 71823 -[07:04:56.076555] Epoch: [3] [270/1624] lr: 0.000013 closs: 0.6772 (0.7451) grad_norm: 0.3897 (0.4427) time: 6.0449 data: 0.0002 max mem: 71823 -[07:05:56.489474] Epoch: [3] [280/1624] lr: 0.000013 closs: 0.8266 (0.7472) grad_norm: 0.3964 (0.4425) time: 6.0443 data: 0.0002 max mem: 71823 -[07:06:56.926991] Epoch: [3] [290/1624] lr: 0.000013 closs: 0.7711 (0.7478) grad_norm: 0.4028 (0.4417) time: 6.0424 data: 0.0002 max mem: 71823 -[07:07:57.512522] Epoch: [3] [300/1624] lr: 0.000013 closs: 0.7695 (0.7480) grad_norm: 0.4028 (0.4426) time: 6.0510 data: 0.0002 max mem: 71823 -[07:08:58.074944] Epoch: [3] [310/1624] lr: 0.000013 closs: 0.7483 (0.7472) grad_norm: 0.3996 (0.4398) time: 6.0573 data: 0.0002 max mem: 71823 -[07:09:58.639942] Epoch: [3] [320/1624] lr: 0.000012 closs: 0.7503 (0.7495) grad_norm: 0.4077 (0.4459) time: 6.0562 data: 0.0002 max mem: 71823 -[07:10:59.257986] Epoch: [3] [330/1624] lr: 0.000012 closs: 0.7271 (0.7483) grad_norm: 0.4154 (0.4458) time: 6.0590 data: 0.0002 max mem: 71823 -[07:11:59.788667] Epoch: [3] [340/1624] lr: 0.000012 closs: 0.7243 (0.7481) grad_norm: 0.4369 (0.4460) time: 6.0573 data: 0.0002 max mem: 71823 -[07:13:00.259757] Epoch: [3] [350/1624] lr: 0.000012 closs: 0.7294 (0.7477) grad_norm: 0.4538 (0.4456) time: 6.0500 data: 0.0002 max mem: 71823 -[07:14:00.770498] Epoch: [3] [360/1624] lr: 0.000012 closs: 0.7235 (0.7480) grad_norm: 0.4398 (0.4444) time: 6.0490 data: 0.0002 max mem: 71823 -[07:15:01.254070] Epoch: [3] [370/1624] lr: 0.000012 closs: 0.7235 (0.7485) grad_norm: 0.4108 (0.4433) time: 6.0496 data: 0.0002 max mem: 71823 -[07:16:01.868131] Epoch: [3] [380/1624] lr: 0.000012 closs: 0.7501 (0.7489) grad_norm: 0.4108 (0.4433) time: 6.0547 data: 0.0002 max mem: 71823 -[07:17:02.439789] Epoch: [3] [390/1624] lr: 0.000012 closs: 0.7039 (0.7472) grad_norm: 0.4089 (0.4423) time: 6.0591 data: 0.0002 max mem: 71823 -[07:18:02.968432] Epoch: [3] [400/1624] lr: 0.000012 closs: 0.7039 (0.7471) grad_norm: 0.4108 (0.4412) time: 6.0549 data: 0.0002 max mem: 71823 -[07:19:03.444351] Epoch: [3] [410/1624] lr: 0.000012 closs: 0.7397 (0.7483) grad_norm: 0.4114 (0.4404) time: 6.0501 data: 0.0002 max mem: 71823 -[07:20:03.958426] Epoch: [3] [420/1624] lr: 0.000011 closs: 0.6928 (0.7457) grad_norm: 0.4061 (0.4397) time: 6.0494 data: 0.0002 max mem: 71823 -[07:21:04.613775] Epoch: [3] [430/1624] lr: 0.000011 closs: 0.6928 (0.7470) grad_norm: 0.4133 (0.4398) time: 6.0583 data: 0.0002 max mem: 71823 -[07:22:05.036533] Epoch: [3] [440/1624] lr: 0.000011 closs: 0.7394 (0.7484) grad_norm: 0.4061 (0.4388) time: 6.0538 data: 0.0002 max mem: 71823 -[07:23:05.550939] Epoch: [3] [450/1624] lr: 0.000011 closs: 0.8170 (0.7503) grad_norm: 0.4007 (0.4376) time: 6.0467 data: 0.0002 max mem: 71823 -[07:24:06.112524] Epoch: [3] [460/1624] lr: 0.000011 closs: 0.7939 (0.7503) grad_norm: 0.3965 (0.4372) time: 6.0536 data: 0.0002 max mem: 71823 -[07:25:06.640714] Epoch: [3] [470/1624] lr: 0.000011 closs: 0.7612 (0.7516) grad_norm: 0.3949 (0.4377) time: 6.0543 data: 0.0002 max mem: 71823 -[07:26:07.211734] Epoch: [3] [480/1624] lr: 0.000011 closs: 0.7924 (0.7511) grad_norm: 0.4007 (0.4381) time: 6.0548 data: 0.0002 max mem: 71823 -[07:27:07.792915] Epoch: [3] [490/1624] lr: 0.000011 closs: 0.7278 (0.7508) grad_norm: 0.4180 (0.4374) time: 6.0575 data: 0.0002 max mem: 71823 -[07:28:08.340086] Epoch: [3] [500/1624] lr: 0.000011 closs: 0.7245 (0.7494) grad_norm: 0.4317 (0.4399) time: 6.0563 data: 0.0002 max mem: 71823 -[07:29:08.881437] Epoch: [3] [510/1624] lr: 0.000011 closs: 0.6626 (0.7492) grad_norm: 0.4156 (0.4398) time: 6.0543 data: 0.0002 max mem: 71823 -[07:30:09.336723] Epoch: [3] [520/1624] lr: 0.000010 closs: 0.7220 (0.7493) grad_norm: 0.4139 (0.4387) time: 6.0497 data: 0.0002 max mem: 71823 -[07:31:09.881534] Epoch: [3] [530/1624] lr: 0.000010 closs: 0.7427 (0.7496) grad_norm: 0.4139 (0.4383) time: 6.0499 data: 0.0002 max mem: 71823 -[07:32:10.440703] Epoch: [3] [540/1624] lr: 0.000010 closs: 0.8222 (0.7511) grad_norm: 0.4139 (0.4390) time: 6.0551 data: 0.0002 max mem: 71823 -[07:33:10.953965] Epoch: [3] [550/1624] lr: 0.000010 closs: 0.7388 (0.7498) grad_norm: 0.4113 (0.4385) time: 6.0535 data: 0.0002 max mem: 71823 -[07:34:11.438838] Epoch: [3] [560/1624] lr: 0.000010 closs: 0.7891 (0.7517) grad_norm: 0.3964 (0.4376) time: 6.0498 data: 0.0002 max mem: 71823 -[07:35:11.879198] Epoch: [3] [570/1624] lr: 0.000010 closs: 0.7590 (0.7510) grad_norm: 0.4142 (0.4378) time: 6.0461 data: 0.0002 max mem: 71823 -[07:36:12.483554] Epoch: [3] [580/1624] lr: 0.000010 closs: 0.7302 (0.7515) grad_norm: 0.3961 (0.4370) time: 6.0521 data: 0.0002 max mem: 71823 -[07:37:12.899772] Epoch: [3] [590/1624] lr: 0.000010 closs: 0.6813 (0.7494) grad_norm: 0.4049 (0.4367) time: 6.0509 data: 0.0002 max mem: 71823 -[07:38:13.295546] Epoch: [3] [600/1624] lr: 0.000010 closs: 0.6480 (0.7493) grad_norm: 0.4151 (0.4366) time: 6.0404 data: 0.0002 max mem: 71823 -[07:39:13.737319] Epoch: [3] [610/1624] lr: 0.000010 closs: 0.7296 (0.7495) grad_norm: 0.4069 (0.4363) time: 6.0417 data: 0.0002 max mem: 71823 -[07:40:14.323277] Epoch: [3] [620/1624] lr: 0.000010 closs: 0.7292 (0.7496) grad_norm: 0.4110 (0.4367) time: 6.0512 data: 0.0002 max mem: 71823 -[07:41:14.793487] Epoch: [3] [630/1624] lr: 0.000009 closs: 0.7290 (0.7495) grad_norm: 0.4110 (0.4369) time: 6.0527 data: 0.0002 max mem: 71823 -[07:42:15.284097] Epoch: [3] [640/1624] lr: 0.000009 closs: 0.7228 (0.7494) grad_norm: 0.4203 (0.4369) time: 6.0479 data: 0.0002 max mem: 71823 -[07:43:15.859972] Epoch: [3] [650/1624] lr: 0.000009 closs: 0.7151 (0.7482) grad_norm: 0.4174 (0.4365) time: 6.0532 data: 0.0002 max mem: 71823 -[07:44:16.221420] Epoch: [3] [660/1624] lr: 0.000009 closs: 0.6643 (0.7476) grad_norm: 0.4174 (0.4360) time: 6.0467 data: 0.0002 max mem: 71823 -[07:45:16.748663] Epoch: [3] [670/1624] lr: 0.000009 closs: 0.6893 (0.7477) grad_norm: 0.4174 (0.4365) time: 6.0443 data: 0.0002 max mem: 71823 -[07:46:17.221917] Epoch: [3] [680/1624] lr: 0.000009 closs: 0.7292 (0.7471) grad_norm: 0.4085 (0.4362) time: 6.0499 data: 0.0002 max mem: 71823 -[07:47:17.707286] Epoch: [3] [690/1624] lr: 0.000009 closs: 0.7680 (0.7481) grad_norm: 0.4194 (0.4365) time: 6.0478 data: 0.0002 max mem: 71823 -[07:48:18.403745] Epoch: [3] [700/1624] lr: 0.000009 closs: 0.8011 (0.7479) grad_norm: 0.4260 (0.4366) time: 6.0589 data: 0.0002 max mem: 71823 -[07:49:18.839561] Epoch: [3] [710/1624] lr: 0.000009 closs: 0.8140 (0.7482) grad_norm: 0.4260 (0.4369) time: 6.0565 data: 0.0002 max mem: 71823 -[07:50:19.238241] Epoch: [3] [720/1624] lr: 0.000009 closs: 0.7311 (0.7473) grad_norm: 0.4222 (0.4367) time: 6.0416 data: 0.0002 max mem: 71823 -[07:51:19.628643] Epoch: [3] [730/1624] lr: 0.000009 closs: 0.7190 (0.7467) grad_norm: 0.4148 (0.4362) time: 6.0393 data: 0.0002 max mem: 71823 -[07:52:20.151833] Epoch: [3] [740/1624] lr: 0.000009 closs: 0.6668 (0.7458) grad_norm: 0.4035 (0.4358) time: 6.0455 data: 0.0002 max mem: 71823 -[07:53:20.547144] Epoch: [3] [750/1624] lr: 0.000008 closs: 0.6314 (0.7438) grad_norm: 0.4036 (0.4361) time: 6.0458 data: 0.0002 max mem: 71823 -[07:54:21.013671] Epoch: [3] [760/1624] lr: 0.000008 closs: 0.6351 (0.7430) grad_norm: 0.3988 (0.4354) time: 6.0430 data: 0.0002 max mem: 71823 -[07:55:21.517490] Epoch: [3] [770/1624] lr: 0.000008 closs: 0.7029 (0.7425) grad_norm: 0.4035 (0.4355) time: 6.0484 data: 0.0002 max mem: 71823 -[07:56:22.020959] Epoch: [3] [780/1624] lr: 0.000008 closs: 0.7014 (0.7422) grad_norm: 0.3876 (0.4354) time: 6.0502 data: 0.0002 max mem: 71823 -[07:57:22.529816] Epoch: [3] [790/1624] lr: 0.000008 closs: 0.7255 (0.7422) grad_norm: 0.3783 (0.4357) time: 6.0505 data: 0.0002 max mem: 71823 -[07:58:22.965729] Epoch: [3] [800/1624] lr: 0.000008 closs: 0.6997 (0.7418) grad_norm: 0.4052 (0.4355) time: 6.0471 data: 0.0002 max mem: 71823 -[07:59:23.333831] Epoch: [3] [810/1624] lr: 0.000008 closs: 0.6842 (0.7411) grad_norm: 0.4311 (0.4365) time: 6.0401 data: 0.0002 max mem: 71823 -[08:00:23.813551] Epoch: [3] [820/1624] lr: 0.000008 closs: 0.7208 (0.7411) grad_norm: 0.4473 (0.4365) time: 6.0423 data: 0.0002 max mem: 71823 -[08:01:24.247535] Epoch: [3] [830/1624] lr: 0.000008 closs: 0.7344 (0.7409) grad_norm: 0.4085 (0.4355) time: 6.0456 data: 0.0002 max mem: 71823 -[08:02:24.922307] Epoch: [3] [840/1624] lr: 0.000008 closs: 0.7102 (0.7406) grad_norm: 0.4061 (0.4360) time: 6.0553 data: 0.0002 max mem: 71823 -[08:03:25.388434] Epoch: [3] [850/1624] lr: 0.000008 closs: 0.6744 (0.7404) grad_norm: 0.3810 (0.4356) time: 6.0569 data: 0.0002 max mem: 71823 -[08:04:25.968924] Epoch: [3] [860/1624] lr: 0.000008 closs: 0.7216 (0.7402) grad_norm: 0.3875 (0.4359) time: 6.0522 data: 0.0002 max mem: 71823 -[08:05:26.434800] Epoch: [3] [870/1624] lr: 0.000008 closs: 0.7141 (0.7395) grad_norm: 0.3910 (0.4352) time: 6.0522 data: 0.0002 max mem: 71823 -[08:06:26.921792] Epoch: [3] [880/1624] lr: 0.000008 closs: 0.6988 (0.7393) grad_norm: 0.4192 (0.4363) time: 6.0475 data: 0.0002 max mem: 71823 -[08:07:27.475925] Epoch: [3] [890/1624] lr: 0.000007 closs: 0.7391 (0.7394) grad_norm: 0.4219 (0.4362) time: 6.0519 data: 0.0002 max mem: 71823 -[08:08:28.014396] Epoch: [3] [900/1624] lr: 0.000007 closs: 0.6470 (0.7388) grad_norm: 0.4192 (0.4358) time: 6.0545 data: 0.0002 max mem: 71823 -[08:09:28.474224] Epoch: [3] [910/1624] lr: 0.000007 closs: 0.6444 (0.7383) grad_norm: 0.4200 (0.4364) time: 6.0498 data: 0.0002 max mem: 71823 -[08:10:28.945320] Epoch: [3] [920/1624] lr: 0.000007 closs: 0.7081 (0.7380) grad_norm: 0.4039 (0.4368) time: 6.0464 data: 0.0002 max mem: 71823 -[08:11:29.324558] Epoch: [3] [930/1624] lr: 0.000007 closs: 0.7514 (0.7386) grad_norm: 0.4050 (0.4368) time: 6.0424 data: 0.0002 max mem: 71823 -[08:12:29.892712] Epoch: [3] [940/1624] lr: 0.000007 closs: 0.7514 (0.7383) grad_norm: 0.4050 (0.4363) time: 6.0472 data: 0.0002 max mem: 71823 -[08:13:30.396168] Epoch: [3] [950/1624] lr: 0.000007 closs: 0.7649 (0.7391) grad_norm: 0.4050 (0.4360) time: 6.0534 data: 0.0002 max mem: 71823 -[08:14:30.949100] Epoch: [3] [960/1624] lr: 0.000007 closs: 0.7385 (0.7387) grad_norm: 0.4190 (0.4366) time: 6.0527 data: 0.0002 max mem: 71823 -[08:15:31.427605] Epoch: [3] [970/1624] lr: 0.000007 closs: 0.7310 (0.7388) grad_norm: 0.3980 (0.4361) time: 6.0514 data: 0.0002 max mem: 71823 -[08:16:31.867276] Epoch: [3] [980/1624] lr: 0.000007 closs: 0.7638 (0.7394) grad_norm: 0.4190 (0.4360) time: 6.0458 data: 0.0002 max mem: 71823 -[08:17:32.374950] Epoch: [3] [990/1624] lr: 0.000007 closs: 0.7601 (0.7388) grad_norm: 0.4190 (0.4360) time: 6.0472 data: 0.0002 max mem: 71823 -[08:18:32.696247] Epoch: [3] [1000/1624] lr: 0.000007 closs: 0.6931 (0.7383) grad_norm: 0.3894 (0.4359) time: 6.0413 data: 0.0002 max mem: 71823 -[08:19:33.532745] Epoch: [3] [1010/1624] lr: 0.000007 closs: 0.7123 (0.7388) grad_norm: 0.4108 (0.4356) time: 6.0577 data: 0.0002 max mem: 71823 -[08:20:34.066742] Epoch: [3] [1020/1624] lr: 0.000007 closs: 0.7333 (0.7386) grad_norm: 0.4076 (0.4361) time: 6.0684 data: 0.0002 max mem: 71823 -[08:21:34.443369] Epoch: [3] [1030/1624] lr: 0.000007 closs: 0.6746 (0.7380) grad_norm: 0.4076 (0.4356) time: 6.0454 data: 0.0002 max mem: 71823 -[08:22:34.986155] Epoch: [3] [1040/1624] lr: 0.000007 closs: 0.6767 (0.7384) grad_norm: 0.4245 (0.4356) time: 6.0458 data: 0.0002 max mem: 71823 -[08:23:35.338958] Epoch: [3] [1050/1624] lr: 0.000007 closs: 0.6982 (0.7384) grad_norm: 0.4109 (0.4353) time: 6.0446 data: 0.0002 max mem: 71823 -[08:24:35.695356] Epoch: [3] [1060/1624] lr: 0.000006 closs: 0.6699 (0.7382) grad_norm: 0.4245 (0.4361) time: 6.0353 data: 0.0002 max mem: 71823 -[08:25:36.034063] Epoch: [3] [1070/1624] lr: 0.000006 closs: 0.7570 (0.7384) grad_norm: 0.4389 (0.4359) time: 6.0346 data: 0.0002 max mem: 71823 -[08:26:36.445268] Epoch: [3] [1080/1624] lr: 0.000006 closs: 0.7789 (0.7388) grad_norm: 0.4233 (0.4359) time: 6.0374 data: 0.0002 max mem: 71823 -[08:27:36.999046] Epoch: [3] [1090/1624] lr: 0.000006 closs: 0.7446 (0.7385) grad_norm: 0.4151 (0.4357) time: 6.0481 data: 0.0002 max mem: 71823 -[08:28:37.323634] Epoch: [3] [1100/1624] lr: 0.000006 closs: 0.6911 (0.7390) grad_norm: 0.4067 (0.4357) time: 6.0438 data: 0.0002 max mem: 71823 -[08:29:37.796685] Epoch: [3] [1110/1624] lr: 0.000006 closs: 0.7722 (0.7394) grad_norm: 0.4067 (0.4356) time: 6.0397 data: 0.0002 max mem: 71823 -[08:30:38.431613] Epoch: [3] [1120/1624] lr: 0.000006 closs: 0.7677 (0.7395) grad_norm: 0.4041 (0.4354) time: 6.0553 data: 0.0002 max mem: 71823 -[08:31:38.688031] Epoch: [3] [1130/1624] lr: 0.000006 closs: 0.7303 (0.7403) grad_norm: 0.4087 (0.4354) time: 6.0444 data: 0.0002 max mem: 71823 -[08:32:39.027255] Epoch: [3] [1140/1624] lr: 0.000006 closs: 0.7303 (0.7400) grad_norm: 0.4293 (0.4355) time: 6.0297 data: 0.0002 max mem: 71823 -[08:33:39.511053] Epoch: [3] [1150/1624] lr: 0.000006 closs: 0.7460 (0.7402) grad_norm: 0.4346 (0.4356) time: 6.0410 data: 0.0002 max mem: 71823 -[08:34:39.976294] Epoch: [3] [1160/1624] lr: 0.000006 closs: 0.7460 (0.7403) grad_norm: 0.4416 (0.4355) time: 6.0473 data: 0.0002 max mem: 71823 -[08:35:40.620959] Epoch: [3] [1170/1624] lr: 0.000006 closs: 0.7206 (0.7403) grad_norm: 0.4042 (0.4350) time: 6.0553 data: 0.0002 max mem: 71823 -[08:36:41.178954] Epoch: [3] [1180/1624] lr: 0.000006 closs: 0.7206 (0.7404) grad_norm: 0.4023 (0.4348) time: 6.0600 data: 0.0002 max mem: 71823 -[08:37:41.785757] Epoch: [3] [1190/1624] lr: 0.000006 closs: 0.7266 (0.7399) grad_norm: 0.3973 (0.4349) time: 6.0581 data: 0.0002 max mem: 71823 -[08:38:42.291374] Epoch: [3] [1200/1624] lr: 0.000006 closs: 0.7231 (0.7401) grad_norm: 0.4023 (0.4349) time: 6.0555 data: 0.0002 max mem: 71823 -[08:39:42.614965] Epoch: [3] [1210/1624] lr: 0.000006 closs: 0.6948 (0.7400) grad_norm: 0.4095 (0.4346) time: 6.0413 data: 0.0002 max mem: 71823 -[08:40:43.014991] Epoch: [3] [1220/1624] lr: 0.000006 closs: 0.7240 (0.7402) grad_norm: 0.4121 (0.4347) time: 6.0361 data: 0.0002 max mem: 71823 -[08:41:43.511347] Epoch: [3] [1230/1624] lr: 0.000006 closs: 0.7780 (0.7404) grad_norm: 0.4121 (0.4350) time: 6.0447 data: 0.0002 max mem: 71823 -[08:42:43.890515] Epoch: [3] [1240/1624] lr: 0.000006 closs: 0.7611 (0.7403) grad_norm: 0.4093 (0.4349) time: 6.0436 data: 0.0002 max mem: 71823 -[08:43:44.397817] Epoch: [3] [1250/1624] lr: 0.000006 closs: 0.6760 (0.7398) grad_norm: 0.4253 (0.4348) time: 6.0442 data: 0.0002 max mem: 71823 -[08:44:44.829848] Epoch: [3] [1260/1624] lr: 0.000006 closs: 0.6336 (0.7391) grad_norm: 0.4253 (0.4351) time: 6.0468 data: 0.0002 max mem: 71823 -[08:45:45.333081] Epoch: [3] [1270/1624] lr: 0.000006 closs: 0.6460 (0.7384) grad_norm: 0.4148 (0.4353) time: 6.0466 data: 0.0002 max mem: 71823 -[08:46:46.032063] Epoch: [3] [1280/1624] lr: 0.000006 closs: 0.6806 (0.7380) grad_norm: 0.4150 (0.4351) time: 6.0600 data: 0.0002 max mem: 71823 -[08:47:46.354993] Epoch: [3] [1290/1624] lr: 0.000006 closs: 0.7550 (0.7380) grad_norm: 0.4148 (0.4355) time: 6.0510 data: 0.0002 max mem: 71823 -[08:48:46.909188] Epoch: [3] [1300/1624] lr: 0.000005 closs: 0.7596 (0.7380) grad_norm: 0.3993 (0.4358) time: 6.0437 data: 0.0002 max mem: 71823 -[08:49:47.328852] Epoch: [3] [1310/1624] lr: 0.000005 closs: 0.7242 (0.7379) grad_norm: 0.4052 (0.4361) time: 6.0486 data: 0.0002 max mem: 71823 -[08:50:47.797474] Epoch: [3] [1320/1624] lr: 0.000005 closs: 0.7242 (0.7381) grad_norm: 0.4052 (0.4361) time: 6.0443 data: 0.0002 max mem: 71823 -[08:51:48.403805] Epoch: [3] [1330/1624] lr: 0.000005 closs: 0.6615 (0.7374) grad_norm: 0.4096 (0.4359) time: 6.0536 data: 0.0002 max mem: 71823 -[08:52:48.858634] Epoch: [3] [1340/1624] lr: 0.000005 closs: 0.6514 (0.7376) grad_norm: 0.4378 (0.4366) time: 6.0529 data: 0.0002 max mem: 71823 -[08:53:49.436905] Epoch: [3] [1350/1624] lr: 0.000005 closs: 0.7221 (0.7379) grad_norm: 0.4403 (0.4521) time: 6.0515 data: 0.0002 max mem: 71823 -[08:54:50.077791] Epoch: [3] [1360/1624] lr: 0.000005 closs: 0.7783 (0.7381) grad_norm: 0.4320 (0.4517) time: 6.0608 data: 0.0002 max mem: 71823 -[08:55:50.548569] Epoch: [3] [1370/1624] lr: 0.000005 closs: 0.6971 (0.7377) grad_norm: 0.4320 (0.4516) time: 6.0555 data: 0.0002 max mem: 71823 -[08:56:51.091302] Epoch: [3] [1380/1624] lr: 0.000005 closs: 0.7008 (0.7384) grad_norm: 0.3892 (0.4514) time: 6.0505 data: 0.0002 max mem: 71823 -[08:57:51.511020] Epoch: [3] [1390/1624] lr: 0.000005 closs: 0.7050 (0.7380) grad_norm: 0.3953 (0.4513) time: 6.0480 data: 0.0002 max mem: 71823 -[08:58:51.910834] Epoch: [3] [1400/1624] lr: 0.000005 closs: 0.6986 (0.7378) grad_norm: 0.3867 (0.4508) time: 6.0408 data: 0.0002 max mem: 71823 -[08:59:52.404603] Epoch: [3] [1410/1624] lr: 0.000005 closs: 0.7451 (0.7381) grad_norm: 0.4042 (0.4510) time: 6.0446 data: 0.0002 max mem: 71823 -[09:00:52.989508] Epoch: [3] [1420/1624] lr: 0.000005 closs: 0.7467 (0.7379) grad_norm: 0.4074 (0.4510) time: 6.0538 data: 0.0002 max mem: 71823 -[09:01:53.420389] Epoch: [3] [1430/1624] lr: 0.000005 closs: 0.7522 (0.7392) grad_norm: 0.4074 (0.4510) time: 6.0507 data: 0.0002 max mem: 71823 -[09:02:54.055985] Epoch: [3] [1440/1624] lr: 0.000005 closs: 0.7851 (0.7394) grad_norm: 0.4216 (0.4514) time: 6.0532 data: 0.0002 max mem: 71823 -[09:03:54.267314] Epoch: [3] [1450/1624] lr: 0.000005 closs: 0.7260 (0.7393) grad_norm: 0.4035 (0.4510) time: 6.0422 data: 0.0002 max mem: 71823 -[09:04:54.716911] Epoch: [3] [1460/1624] lr: 0.000005 closs: 0.7328 (0.7394) grad_norm: 0.4301 (0.4518) time: 6.0329 data: 0.0002 max mem: 71823 -[09:05:55.239016] Epoch: [3] [1470/1624] lr: 0.000005 closs: 0.7657 (0.7398) grad_norm: 0.4323 (0.4518) time: 6.0484 data: 0.0002 max mem: 71823 -[09:06:55.625606] Epoch: [3] [1480/1624] lr: 0.000005 closs: 0.7657 (0.7397) grad_norm: 0.4323 (0.4515) time: 6.0453 data: 0.0002 max mem: 71823 -[09:07:56.167577] Epoch: [3] [1490/1624] lr: 0.000005 closs: 0.7271 (0.7394) grad_norm: 0.4384 (0.4522) time: 6.0463 data: 0.0002 max mem: 71823 -[09:08:56.752648] Epoch: [3] [1500/1624] lr: 0.000005 closs: 0.7213 (0.7392) grad_norm: 0.4323 (0.4524) time: 6.0562 data: 0.0002 max mem: 71823 -[09:09:57.241299] Epoch: [3] [1510/1624] lr: 0.000005 closs: 0.7213 (0.7394) grad_norm: 0.4210 (0.4525) time: 6.0536 data: 0.0002 max mem: 71823 -[09:10:57.805490] Epoch: [3] [1520/1624] lr: 0.000005 closs: 0.7446 (0.7394) grad_norm: 0.4231 (0.4523) time: 6.0525 data: 0.0002 max mem: 71823 -[09:11:58.172894] Epoch: [3] [1530/1624] lr: 0.000005 closs: 0.7576 (0.7396) grad_norm: 0.3988 (0.4519) time: 6.0464 data: 0.0002 max mem: 71823 -[09:12:58.526472] Epoch: [3] [1540/1624] lr: 0.000005 closs: 0.7306 (0.7398) grad_norm: 0.4210 (0.4519) time: 6.0359 data: 0.0002 max mem: 71823 -[09:13:58.978036] Epoch: [3] [1550/1624] lr: 0.000005 closs: 0.7349 (0.7397) grad_norm: 0.4114 (0.4517) time: 6.0401 data: 0.0002 max mem: 71823 -[09:14:59.646430] Epoch: [3] [1560/1624] lr: 0.000005 closs: 0.7238 (0.7396) grad_norm: 0.4120 (0.4516) time: 6.0559 data: 0.0002 max mem: 71823 -[09:16:00.191728] Epoch: [3] [1570/1624] lr: 0.000005 closs: 0.7019 (0.7392) grad_norm: 0.4264 (0.4514) time: 6.0606 data: 0.0002 max mem: 71823 -[09:17:00.713642] Epoch: [3] [1580/1624] lr: 0.000005 closs: 0.7117 (0.7393) grad_norm: 0.4057 (0.4512) time: 6.0532 data: 0.0002 max mem: 71823 -[09:18:01.252343] Epoch: [3] [1590/1624] lr: 0.000005 closs: 0.7379 (0.7396) grad_norm: 0.4039 (0.4509) time: 6.0529 data: 0.0002 max mem: 71823 -[09:19:01.956294] Epoch: [3] [1600/1624] lr: 0.000005 closs: 0.8150 (0.7401) grad_norm: 0.4026 (0.4513) time: 6.0620 data: 0.0002 max mem: 71823 -[09:20:02.308941] Epoch: [3] [1610/1624] lr: 0.000005 closs: 0.8374 (0.7402) grad_norm: 0.4173 (0.4513) time: 6.0527 data: 0.0002 max mem: 71823 -[09:21:02.722758] Epoch: [3] [1620/1624] lr: 0.000005 closs: 0.7011 (0.7399) grad_norm: 0.4234 (0.4513) time: 6.0382 data: 0.0002 max mem: 71823 -[09:21:21.564692] Epoch: [3] Total time: 2:43:47 -[09:21:21.669617] Averaged stats: lr: 0.000005 closs: 0.7123 (0.7438) grad_norm: 0.4234 (0.4512) -[09:21:22.673612] model saved -[09:21:27.199683] optimizer saved -[09:21:27.200535] other rank-common saved -[09:21:27.207623] rank-specific saved -[09:21:27.207809] Training time 10:55:37 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth deleted file mode 100644 index 8f3ba8a74edeb56d30f33b93229d72ff0893234d..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:57840f6a94b317e5216383955e671d5c485aa459e9a77c46399004b14aab3449 -size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index e0aa9f434f0cc0a821265ec66357c81590a16153..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98e48515afe7b737d914e6a945fddc635b1f3bd53d716da73568279b19d4e9fc -size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth deleted file mode 100644 index 6c3e73a9d547910fb183caa16be9b2f9ed5a7866..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15ad86707b40e58d9f1e5b304e7f138c74be797e45cdcd6c9c3e67d1ddea2a8b -size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth deleted file mode 100644 index d5700f48e3c878c8ea005c25793d536c052e44be..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth deleted file mode 100644 index 695871058b022437de6682dd3f0b8db0055fc6fa..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth deleted file mode 100644 index bf8b0b7d0020d9c161451e9628f04abe5fbbfd7d..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:95c28c92604c98733f7d11de93aabb7b91bf51cf6d5d1b4a7648f88735df9be8 -size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 4ae586e8704515a765d6489bf6c23b882d96dd76..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:726e9afa5e22acbf9e238aaa616537ff8bc766c7a407d50f49fd184a85596d3f -size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth deleted file mode 100644 index db1a4f53cdd71b119a332dfc0460aacb6dcba83d..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e3ffbc6452c328bba47a43a12dbd4bc293c231de8f57a6bd819aa611ed703d60 -size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth deleted file mode 100644 index d5700f48e3c878c8ea005c25793d536c052e44be..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth deleted file mode 100644 index 695871058b022437de6682dd3f0b8db0055fc6fa..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth deleted file mode 100644 index 0b6b9e7e744aa00e7901cd325ae5b2ca70edd02f..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71c909bd4009747dcecd359334b72083ce7d70ba611d0835b3f9e805633df345 -size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 8209a0373966755123d2c2a67c9f7f3716dad120..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8af183f8bb45d2e6aecf19b37dd23b380900961ab1df49a422c47a88086dd99b -size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth deleted file mode 100644 index 654e95a72619e275667b7e2858b830b4eb336d60..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e1407e17b55be720204de47db0e9d89c18253e4bd99ce9beecf96812ad9220b -size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth deleted file mode 100644 index d5700f48e3c878c8ea005c25793d536c052e44be..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth deleted file mode 100644 index 695871058b022437de6682dd3f0b8db0055fc6fa..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth deleted file mode 100644 index fdb8a6396561e7587d5763107287361c3b7d2a65..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ca963a2250f78056df0749f6d91daf572f7e87a00398a2ea04fb8e0d4fb2981 -size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 28bd47df51e7ce248d00ef3d9ee2ca5cd4821c25..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d8f6388325fba6658d8f5f383cb6affc164930411672f0944d7f249e10a03b78 -size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth deleted file mode 100644 index d5680517ef1395e38ea1a66b086e2213b58700f0..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99c45bf04236026e6ce60807e96c85d649fb32f7065883be1052037d816478dc -size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth deleted file mode 100644 index d5700f48e3c878c8ea005c25793d536c052e44be..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth deleted file mode 100644 index 695871058b022437de6682dd3f0b8db0055fc6fa..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt deleted file mode 100644 index e2ec26280e83c14eb2d09d4d2d0ccd17ebb3ede7..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt +++ /dev/null @@ -1,4 +0,0 @@ -{"train_lr": 2.49923076923077e-05, "train_closs": 0.7980487976441016, "train_grad_norm": 0.5980251384056532, "epoch": 0, "val_lr": 2.49923076923077e-05, "val_closs": 0.7980487976441016, "val_grad_norm": 0.5980251384056532} -{"train_lr": 4.6109080828728024e-05, "train_closs": 0.7623572307492678, "train_grad_norm": 0.45453824085914174, "epoch": 1, "val_lr": 4.6109080828728024e-05, "val_closs": 0.7623572307492678, "val_grad_norm": 0.45453824085914174} -{"train_lr": 2.750346153846151e-05, "train_closs": 0.750338752788993, "train_grad_norm": 0.46191218195511746, "epoch": 2, "val_lr": 2.750346153846151e-05, "val_closs": 0.750338752788993, "val_grad_norm": 0.46191218195511746} -{"train_lr": 8.894380709733404e-06, "train_closs": 0.742047518081963, "train_grad_norm": 0.47685301401064945, "epoch": 3, "val_lr": 8.894380709733404e-06, "val_closs": 0.742047518081963, "val_grad_norm": 0.47685301401064945} diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log deleted file mode 100644 index 21811241a7b4e3fced1b177e3d157ea8d3383e84..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log +++ /dev/null @@ -1,3895 +0,0 @@ -WARNING:torch.distributed.run: -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -| distributed init (rank 1): env://, gpu 1 -| distributed init (rank 0): env://, gpu 0 -[19:19:28.076271] > initializing model parallel with size 1 -[19:19:28.076336] > initializing ddp with size 2 -[19:19:28.076342] > initializing pipeline with size 1 -[19:19:28.121707] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory -[19:19:28.121800] Namespace(batch_size=4, -accum_iter=2, -llama_type='llama_peft', -llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'], -no_visual=True, -tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model', -pretrained_path='../checkpoints/llama2/Llama-2-70b/', -pretrained_type='meta_ori', -weight_decay=0.02, -lr=5e-05, -min_lr=5e-06, -epochs=4, -warmup_epochs=1.0, -clip_grad=2, -max_words=512, -dialog=False, -data_config='configs/data/finetune/sg/alpaca.yaml', -output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B', -log_dir='./output_dir', -save_interval=1, -device='cuda', -seed=0, -resume='', -num_workers=8, -pin_mem=True, -world_size=2, -local_rank=-1, -dist_on_itp=False, -dist_url='env://', -model_parallel_size=1, -data_parallel='sdp', -precision='bf16', -checkpointing=True, -quant=True, -rank=0, -gpu=0, -distributed=True, -dist_backend='nccl') -[19:19:28.133114] Model Args: - ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) -[19:19:48.469330] Epoch: [0] [1250/6500] lr: 0.000010 closs: 0.7255 (0.8696) grad_norm: 0.6067 (0.8306) time: 5.5775 data: 0.0002 max mem: 71357 -[19:20:44.159763] Epoch: [0] [1260/6500] lr: 0.000010 closs: 0.7255 (0.8691) grad_norm: 0.6007 (0.8286) time: 5.5761 data: 0.0002 max mem: 71357 -[19:21:39.944680] Epoch: [0] [1270/6500] lr: 0.000010 closs: 0.8232 (0.8687) grad_norm: 0.5559 (0.8273) time: 5.5737 data: 0.0002 max mem: 71357 -[19:22:35.727072] Epoch: [0] [1280/6500] lr: 0.000010 closs: 0.7722 (0.8681) grad_norm: 0.6128 (0.8262) time: 5.5782 data: 0.0002 max mem: 71357 -[19:23:31.545412] Epoch: [0] [1290/6500] lr: 0.000010 closs: 0.7420 (0.8668) grad_norm: 0.5559 (0.8239) time: 5.5799 data: 0.0002 max mem: 71357 -[19:24:27.267608] Epoch: [0] [1300/6500] lr: 0.000010 closs: 0.7926 (0.8669) grad_norm: 0.6245 (0.8228) time: 5.5769 data: 0.0002 max mem: 71357 -[19:25:23.128280] Epoch: [0] [1310/6500] lr: 0.000010 closs: 0.7852 (0.8662) grad_norm: 0.6224 (0.8206) time: 5.5791 data: 0.0002 max mem: 71357 -[19:26:18.848671] Epoch: [0] [1320/6500] lr: 0.000010 closs: 0.7704 (0.8660) grad_norm: 0.5277 (0.8185) time: 5.5789 data: 0.0002 max mem: 71357 -[19:27:14.498952] Epoch: [0] [1330/6500] lr: 0.000010 closs: 0.8118 (0.8657) grad_norm: 0.5535 (0.8175) time: 5.5684 data: 0.0002 max mem: 71357 -[19:27:29.243249] Model is Peft: True -[19:27:29.250590] Trainable parameter count : 8036352 (local rank), 8036352 (all). -[19:27:29.274209] Trainable param: llma.tok_embeddings.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274240] Trainable param: llma.layers.0.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274253] Trainable param: llma.layers.0.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.274266] Trainable param: llma.layers.0.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274276] Trainable param: llma.layers.0.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274288] Trainable param: llma.layers.0.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274299] Trainable param: llma.layers.0.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274310] Trainable param: llma.layers.0.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274320] Trainable param: llma.layers.0.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274333] Trainable param: llma.layers.0.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274343] Trainable param: llma.layers.0.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.274355] Trainable param: llma.layers.0.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274365] Trainable param: llma.layers.0.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274376] Trainable param: llma.layers.0.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274386] Trainable param: llma.layers.0.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.274397] Trainable param: llma.layers.0.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274408] Trainable param: llma.layers.0.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274423] Trainable param: llma.layers.1.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274435] Trainable param: llma.layers.1.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.274446] Trainable param: llma.layers.1.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274456] Trainable param: llma.layers.1.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274468] Trainable param: llma.layers.1.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274478] Trainable param: llma.layers.1.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274489] Trainable param: llma.layers.1.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274499] Trainable param: llma.layers.1.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274511] Trainable param: llma.layers.1.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274521] Trainable param: llma.layers.1.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.274532] Trainable param: llma.layers.1.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274542] Trainable param: llma.layers.1.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274553] Trainable param: llma.layers.1.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274563] Trainable param: llma.layers.1.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.274574] Trainable param: llma.layers.1.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274585] Trainable param: llma.layers.1.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274599] Trainable param: llma.layers.2.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274609] Trainable param: llma.layers.2.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.274620] Trainable param: llma.layers.2.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274630] Trainable param: llma.layers.2.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274641] Trainable param: llma.layers.2.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274651] Trainable param: llma.layers.2.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274662] Trainable param: llma.layers.2.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274672] Trainable param: llma.layers.2.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274684] Trainable param: llma.layers.2.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274694] Trainable param: llma.layers.2.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.274705] Trainable param: llma.layers.2.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274715] Trainable param: llma.layers.2.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274726] Trainable param: llma.layers.2.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274736] Trainable param: llma.layers.2.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.274748] Trainable param: llma.layers.2.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274759] Trainable param: llma.layers.2.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274772] Trainable param: llma.layers.3.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274782] Trainable param: llma.layers.3.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.274793] Trainable param: llma.layers.3.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274803] Trainable param: llma.layers.3.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274814] Trainable param: llma.layers.3.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274824] Trainable param: llma.layers.3.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274835] Trainable param: llma.layers.3.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274845] Trainable param: llma.layers.3.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274857] Trainable param: llma.layers.3.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274867] Trainable param: llma.layers.3.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.274878] Trainable param: llma.layers.3.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274888] Trainable param: llma.layers.3.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274900] Trainable param: llma.layers.3.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274909] Trainable param: llma.layers.3.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.274921] Trainable param: llma.layers.3.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274931] Trainable param: llma.layers.3.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.274945] Trainable param: llma.layers.4.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274954] Trainable param: llma.layers.4.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.274966] Trainable param: llma.layers.4.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274976] Trainable param: llma.layers.4.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.274987] Trainable param: llma.layers.4.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.274997] Trainable param: llma.layers.4.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275008] Trainable param: llma.layers.4.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275018] Trainable param: llma.layers.4.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275030] Trainable param: llma.layers.4.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275040] Trainable param: llma.layers.4.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275052] Trainable param: llma.layers.4.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275063] Trainable param: llma.layers.4.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275075] Trainable param: llma.layers.4.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275085] Trainable param: llma.layers.4.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275096] Trainable param: llma.layers.4.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275107] Trainable param: llma.layers.4.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275120] Trainable param: llma.layers.5.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275130] Trainable param: llma.layers.5.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.275142] Trainable param: llma.layers.5.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275151] Trainable param: llma.layers.5.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275163] Trainable param: llma.layers.5.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275172] Trainable param: llma.layers.5.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275184] Trainable param: llma.layers.5.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275194] Trainable param: llma.layers.5.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275206] Trainable param: llma.layers.5.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275215] Trainable param: llma.layers.5.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275227] Trainable param: llma.layers.5.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275237] Trainable param: llma.layers.5.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275248] Trainable param: llma.layers.5.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275258] Trainable param: llma.layers.5.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275269] Trainable param: llma.layers.5.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275282] Trainable param: llma.layers.5.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275295] Trainable param: llma.layers.6.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275305] Trainable param: llma.layers.6.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.275316] Trainable param: llma.layers.6.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275326] Trainable param: llma.layers.6.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275338] Trainable param: llma.layers.6.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275347] Trainable param: llma.layers.6.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275359] Trainable param: llma.layers.6.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275368] Trainable param: llma.layers.6.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275380] Trainable param: llma.layers.6.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275390] Trainable param: llma.layers.6.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275401] Trainable param: llma.layers.6.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275411] Trainable param: llma.layers.6.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275423] Trainable param: llma.layers.6.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275432] Trainable param: llma.layers.6.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275443] Trainable param: llma.layers.6.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275454] Trainable param: llma.layers.6.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275467] Trainable param: llma.layers.7.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275477] Trainable param: llma.layers.7.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.275488] Trainable param: llma.layers.7.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275498] Trainable param: llma.layers.7.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275509] Trainable param: llma.layers.7.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275519] Trainable param: llma.layers.7.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275531] Trainable param: llma.layers.7.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275540] Trainable param: llma.layers.7.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275553] Trainable param: llma.layers.7.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275563] Trainable param: llma.layers.7.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275574] Trainable param: llma.layers.7.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275584] Trainable param: llma.layers.7.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275595] Trainable param: llma.layers.7.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275605] Trainable param: llma.layers.7.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275616] Trainable param: llma.layers.7.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275627] Trainable param: llma.layers.7.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275640] Trainable param: llma.layers.8.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275650] Trainable param: llma.layers.8.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.275661] Trainable param: llma.layers.8.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275675] Trainable param: llma.layers.8.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275692] Trainable param: llma.layers.8.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275703] Trainable param: llma.layers.8.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275714] Trainable param: llma.layers.8.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275724] Trainable param: llma.layers.8.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275736] Trainable param: llma.layers.8.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275746] Trainable param: llma.layers.8.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275758] Trainable param: llma.layers.8.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275768] Trainable param: llma.layers.8.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275779] Trainable param: llma.layers.8.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275789] Trainable param: llma.layers.8.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275800] Trainable param: llma.layers.8.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275812] Trainable param: llma.layers.8.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275825] Trainable param: llma.layers.9.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275835] Trainable param: llma.layers.9.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.275846] Trainable param: llma.layers.9.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275856] Trainable param: llma.layers.9.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275868] Trainable param: llma.layers.9.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275878] Trainable param: llma.layers.9.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.275889] Trainable param: llma.layers.9.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275899] Trainable param: llma.layers.9.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275911] Trainable param: llma.layers.9.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275921] Trainable param: llma.layers.9.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275932] Trainable param: llma.layers.9.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275942] Trainable param: llma.layers.9.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275954] Trainable param: llma.layers.9.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.275963] Trainable param: llma.layers.9.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.275975] Trainable param: llma.layers.9.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275986] Trainable param: llma.layers.9.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.275999] Trainable param: llma.layers.10.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276009] Trainable param: llma.layers.10.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.276021] Trainable param: llma.layers.10.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276030] Trainable param: llma.layers.10.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276042] Trainable param: llma.layers.10.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276051] Trainable param: llma.layers.10.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276063] Trainable param: llma.layers.10.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276073] Trainable param: llma.layers.10.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276085] Trainable param: llma.layers.10.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276095] Trainable param: llma.layers.10.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276106] Trainable param: llma.layers.10.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276116] Trainable param: llma.layers.10.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276127] Trainable param: llma.layers.10.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276137] Trainable param: llma.layers.10.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276148] Trainable param: llma.layers.10.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276159] Trainable param: llma.layers.10.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276172] Trainable param: llma.layers.11.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276182] Trainable param: llma.layers.11.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.276194] Trainable param: llma.layers.11.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276203] Trainable param: llma.layers.11.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276215] Trainable param: llma.layers.11.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276224] Trainable param: llma.layers.11.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276236] Trainable param: llma.layers.11.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276245] Trainable param: llma.layers.11.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276257] Trainable param: llma.layers.11.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276267] Trainable param: llma.layers.11.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276278] Trainable param: llma.layers.11.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276288] Trainable param: llma.layers.11.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276299] Trainable param: llma.layers.11.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276309] Trainable param: llma.layers.11.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276320] Trainable param: llma.layers.11.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276331] Trainable param: llma.layers.11.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276344] Trainable param: llma.layers.12.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276354] Trainable param: llma.layers.12.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.276366] Trainable param: llma.layers.12.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276375] Trainable param: llma.layers.12.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276387] Trainable param: llma.layers.12.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276396] Trainable param: llma.layers.12.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276408] Trainable param: llma.layers.12.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276417] Trainable param: llma.layers.12.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276430] Trainable param: llma.layers.12.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276439] Trainable param: llma.layers.12.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276451] Trainable param: llma.layers.12.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276460] Trainable param: llma.layers.12.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276472] Trainable param: llma.layers.12.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276481] Trainable param: llma.layers.12.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276493] Trainable param: llma.layers.12.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276504] Trainable param: llma.layers.12.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276516] Trainable param: llma.layers.13.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276526] Trainable param: llma.layers.13.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.276538] Trainable param: llma.layers.13.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276547] Trainable param: llma.layers.13.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276559] Trainable param: llma.layers.13.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276568] Trainable param: llma.layers.13.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276580] Trainable param: llma.layers.13.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276590] Trainable param: llma.layers.13.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276602] Trainable param: llma.layers.13.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276611] Trainable param: llma.layers.13.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276623] Trainable param: llma.layers.13.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276633] Trainable param: llma.layers.13.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276644] Trainable param: llma.layers.13.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276654] Trainable param: llma.layers.13.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276665] Trainable param: llma.layers.13.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276676] Trainable param: llma.layers.13.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276689] Trainable param: llma.layers.14.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276699] Trainable param: llma.layers.14.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.276710] Trainable param: llma.layers.14.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276720] Trainable param: llma.layers.14.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276731] Trainable param: llma.layers.14.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276741] Trainable param: llma.layers.14.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276752] Trainable param: llma.layers.14.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276762] Trainable param: llma.layers.14.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276774] Trainable param: llma.layers.14.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276784] Trainable param: llma.layers.14.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276795] Trainable param: llma.layers.14.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276805] Trainable param: llma.layers.14.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276816] Trainable param: llma.layers.14.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276826] Trainable param: llma.layers.14.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276837] Trainable param: llma.layers.14.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276848] Trainable param: llma.layers.14.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276861] Trainable param: llma.layers.15.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276871] Trainable param: llma.layers.15.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.276882] Trainable param: llma.layers.15.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276892] Trainable param: llma.layers.15.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276903] Trainable param: llma.layers.15.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276913] Trainable param: llma.layers.15.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.276924] Trainable param: llma.layers.15.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276934] Trainable param: llma.layers.15.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276947] Trainable param: llma.layers.15.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276956] Trainable param: llma.layers.15.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.276968] Trainable param: llma.layers.15.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276977] Trainable param: llma.layers.15.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.276989] Trainable param: llma.layers.15.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.276998] Trainable param: llma.layers.15.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277009] Trainable param: llma.layers.15.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277020] Trainable param: llma.layers.15.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277033] Trainable param: llma.layers.16.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277043] Trainable param: llma.layers.16.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.277055] Trainable param: llma.layers.16.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277065] Trainable param: llma.layers.16.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277076] Trainable param: llma.layers.16.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277086] Trainable param: llma.layers.16.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277097] Trainable param: llma.layers.16.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277107] Trainable param: llma.layers.16.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277119] Trainable param: llma.layers.16.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277129] Trainable param: llma.layers.16.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277140] Trainable param: llma.layers.16.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277150] Trainable param: llma.layers.16.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277161] Trainable param: llma.layers.16.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277171] Trainable param: llma.layers.16.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277182] Trainable param: llma.layers.16.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277193] Trainable param: llma.layers.16.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277206] Trainable param: llma.layers.17.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277216] Trainable param: llma.layers.17.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.277227] Trainable param: llma.layers.17.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277237] Trainable param: llma.layers.17.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277248] Trainable param: llma.layers.17.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277258] Trainable param: llma.layers.17.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277269] Trainable param: llma.layers.17.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277279] Trainable param: llma.layers.17.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277291] Trainable param: llma.layers.17.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277301] Trainable param: llma.layers.17.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277312] Trainable param: llma.layers.17.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277322] Trainable param: llma.layers.17.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277333] Trainable param: llma.layers.17.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277343] Trainable param: llma.layers.17.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277355] Trainable param: llma.layers.17.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277365] Trainable param: llma.layers.17.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277378] Trainable param: llma.layers.18.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277388] Trainable param: llma.layers.18.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.277400] Trainable param: llma.layers.18.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277409] Trainable param: llma.layers.18.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277421] Trainable param: llma.layers.18.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277431] Trainable param: llma.layers.18.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277442] Trainable param: llma.layers.18.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277452] Trainable param: llma.layers.18.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277464] Trainable param: llma.layers.18.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277474] Trainable param: llma.layers.18.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277485] Trainable param: llma.layers.18.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277495] Trainable param: llma.layers.18.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277506] Trainable param: llma.layers.18.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277521] Trainable param: llma.layers.18.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277532] Trainable param: llma.layers.18.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277543] Trainable param: llma.layers.18.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277556] Trainable param: llma.layers.19.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277570] Trainable param: llma.layers.19.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.277581] Trainable param: llma.layers.19.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277591] Trainable param: llma.layers.19.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277603] Trainable param: llma.layers.19.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277613] Trainable param: llma.layers.19.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277624] Trainable param: llma.layers.19.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277634] Trainable param: llma.layers.19.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277646] Trainable param: llma.layers.19.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277656] Trainable param: llma.layers.19.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277667] Trainable param: llma.layers.19.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277677] Trainable param: llma.layers.19.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277688] Trainable param: llma.layers.19.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277698] Trainable param: llma.layers.19.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277709] Trainable param: llma.layers.19.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277720] Trainable param: llma.layers.19.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277733] Trainable param: llma.layers.20.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277743] Trainable param: llma.layers.20.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.277754] Trainable param: llma.layers.20.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277764] Trainable param: llma.layers.20.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277775] Trainable param: llma.layers.20.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277785] Trainable param: llma.layers.20.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277796] Trainable param: llma.layers.20.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277806] Trainable param: llma.layers.20.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277818] Trainable param: llma.layers.20.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277828] Trainable param: llma.layers.20.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277839] Trainable param: llma.layers.20.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277849] Trainable param: llma.layers.20.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277861] Trainable param: llma.layers.20.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277870] Trainable param: llma.layers.20.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.277881] Trainable param: llma.layers.20.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277892] Trainable param: llma.layers.20.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277905] Trainable param: llma.layers.21.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277915] Trainable param: llma.layers.21.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.277926] Trainable param: llma.layers.21.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277936] Trainable param: llma.layers.21.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277947] Trainable param: llma.layers.21.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277957] Trainable param: llma.layers.21.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.277968] Trainable param: llma.layers.21.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.277978] Trainable param: llma.layers.21.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.277990] Trainable param: llma.layers.21.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278000] Trainable param: llma.layers.21.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278011] Trainable param: llma.layers.21.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278021] Trainable param: llma.layers.21.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278032] Trainable param: llma.layers.21.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278042] Trainable param: llma.layers.21.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278053] Trainable param: llma.layers.21.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278064] Trainable param: llma.layers.21.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278077] Trainable param: llma.layers.22.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278086] Trainable param: llma.layers.22.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.278098] Trainable param: llma.layers.22.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278107] Trainable param: llma.layers.22.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278118] Trainable param: llma.layers.22.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278128] Trainable param: llma.layers.22.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278139] Trainable param: llma.layers.22.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278149] Trainable param: llma.layers.22.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278161] Trainable param: llma.layers.22.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278171] Trainable param: llma.layers.22.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278182] Trainable param: llma.layers.22.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278192] Trainable param: llma.layers.22.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278203] Trainable param: llma.layers.22.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278213] Trainable param: llma.layers.22.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278224] Trainable param: llma.layers.22.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278235] Trainable param: llma.layers.22.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278248] Trainable param: llma.layers.23.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278258] Trainable param: llma.layers.23.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.278269] Trainable param: llma.layers.23.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278279] Trainable param: llma.layers.23.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278290] Trainable param: llma.layers.23.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278300] Trainable param: llma.layers.23.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278311] Trainable param: llma.layers.23.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278321] Trainable param: llma.layers.23.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278333] Trainable param: llma.layers.23.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278343] Trainable param: llma.layers.23.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278354] Trainable param: llma.layers.23.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278364] Trainable param: llma.layers.23.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278375] Trainable param: llma.layers.23.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278385] Trainable param: llma.layers.23.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278396] Trainable param: llma.layers.23.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278407] Trainable param: llma.layers.23.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278420] Trainable param: llma.layers.24.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278429] Trainable param: llma.layers.24.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.278441] Trainable param: llma.layers.24.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278451] Trainable param: llma.layers.24.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278462] Trainable param: llma.layers.24.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278472] Trainable param: llma.layers.24.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278483] Trainable param: llma.layers.24.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278493] Trainable param: llma.layers.24.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278505] Trainable param: llma.layers.24.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278515] Trainable param: llma.layers.24.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278526] Trainable param: llma.layers.24.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278536] Trainable param: llma.layers.24.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278547] Trainable param: llma.layers.24.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278557] Trainable param: llma.layers.24.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278568] Trainable param: llma.layers.24.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278579] Trainable param: llma.layers.24.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278595] Trainable param: llma.layers.25.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278605] Trainable param: llma.layers.25.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.278616] Trainable param: llma.layers.25.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278626] Trainable param: llma.layers.25.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278637] Trainable param: llma.layers.25.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278647] Trainable param: llma.layers.25.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278658] Trainable param: llma.layers.25.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278668] Trainable param: llma.layers.25.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278680] Trainable param: llma.layers.25.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278690] Trainable param: llma.layers.25.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278701] Trainable param: llma.layers.25.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278711] Trainable param: llma.layers.25.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278722] Trainable param: llma.layers.25.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278732] Trainable param: llma.layers.25.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278743] Trainable param: llma.layers.25.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278754] Trainable param: llma.layers.25.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278767] Trainable param: llma.layers.26.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278776] Trainable param: llma.layers.26.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.278788] Trainable param: llma.layers.26.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278797] Trainable param: llma.layers.26.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278809] Trainable param: llma.layers.26.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278819] Trainable param: llma.layers.26.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278830] Trainable param: llma.layers.26.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278840] Trainable param: llma.layers.26.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278851] Trainable param: llma.layers.26.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278861] Trainable param: llma.layers.26.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278872] Trainable param: llma.layers.26.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278882] Trainable param: llma.layers.26.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278893] Trainable param: llma.layers.26.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278903] Trainable param: llma.layers.26.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.278915] Trainable param: llma.layers.26.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278925] Trainable param: llma.layers.26.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.278938] Trainable param: llma.layers.27.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278948] Trainable param: llma.layers.27.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.278959] Trainable param: llma.layers.27.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278969] Trainable param: llma.layers.27.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.278981] Trainable param: llma.layers.27.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.278990] Trainable param: llma.layers.27.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279002] Trainable param: llma.layers.27.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279012] Trainable param: llma.layers.27.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279024] Trainable param: llma.layers.27.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279034] Trainable param: llma.layers.27.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279045] Trainable param: llma.layers.27.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279055] Trainable param: llma.layers.27.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279066] Trainable param: llma.layers.27.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279076] Trainable param: llma.layers.27.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279087] Trainable param: llma.layers.27.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279098] Trainable param: llma.layers.27.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279110] Trainable param: llma.layers.28.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279120] Trainable param: llma.layers.28.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.279131] Trainable param: llma.layers.28.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279141] Trainable param: llma.layers.28.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279153] Trainable param: llma.layers.28.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279163] Trainable param: llma.layers.28.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279174] Trainable param: llma.layers.28.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279184] Trainable param: llma.layers.28.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279196] Trainable param: llma.layers.28.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279206] Trainable param: llma.layers.28.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279217] Trainable param: llma.layers.28.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279227] Trainable param: llma.layers.28.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279238] Trainable param: llma.layers.28.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279248] Trainable param: llma.layers.28.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279259] Trainable param: llma.layers.28.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279270] Trainable param: llma.layers.28.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279283] Trainable param: llma.layers.29.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279293] Trainable param: llma.layers.29.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.279304] Trainable param: llma.layers.29.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279314] Trainable param: llma.layers.29.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279325] Trainable param: llma.layers.29.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279335] Trainable param: llma.layers.29.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279346] Trainable param: llma.layers.29.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279356] Trainable param: llma.layers.29.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279368] Trainable param: llma.layers.29.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279378] Trainable param: llma.layers.29.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279389] Trainable param: llma.layers.29.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279399] Trainable param: llma.layers.29.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279410] Trainable param: llma.layers.29.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279420] Trainable param: llma.layers.29.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279431] Trainable param: llma.layers.29.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279442] Trainable param: llma.layers.29.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279455] Trainable param: llma.layers.30.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279465] Trainable param: llma.layers.30.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.279476] Trainable param: llma.layers.30.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279486] Trainable param: llma.layers.30.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279497] Trainable param: llma.layers.30.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279507] Trainable param: llma.layers.30.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279518] Trainable param: llma.layers.30.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279527] Trainable param: llma.layers.30.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279539] Trainable param: llma.layers.30.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279549] Trainable param: llma.layers.30.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279561] Trainable param: llma.layers.30.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279570] Trainable param: llma.layers.30.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279582] Trainable param: llma.layers.30.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279591] Trainable param: llma.layers.30.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279603] Trainable param: llma.layers.30.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279613] Trainable param: llma.layers.30.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279626] Trainable param: llma.layers.31.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279636] Trainable param: llma.layers.31.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.279648] Trainable param: llma.layers.31.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279657] Trainable param: llma.layers.31.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279669] Trainable param: llma.layers.31.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279678] Trainable param: llma.layers.31.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279690] Trainable param: llma.layers.31.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279700] Trainable param: llma.layers.31.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279712] Trainable param: llma.layers.31.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279722] Trainable param: llma.layers.31.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279733] Trainable param: llma.layers.31.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279743] Trainable param: llma.layers.31.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279754] Trainable param: llma.layers.31.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279764] Trainable param: llma.layers.31.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279775] Trainable param: llma.layers.31.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279786] Trainable param: llma.layers.31.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279799] Trainable param: llma.layers.32.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279809] Trainable param: llma.layers.32.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.279821] Trainable param: llma.layers.32.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279830] Trainable param: llma.layers.32.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279842] Trainable param: llma.layers.32.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279852] Trainable param: llma.layers.32.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.279863] Trainable param: llma.layers.32.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279873] Trainable param: llma.layers.32.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279885] Trainable param: llma.layers.32.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279895] Trainable param: llma.layers.32.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279906] Trainable param: llma.layers.32.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279916] Trainable param: llma.layers.32.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279927] Trainable param: llma.layers.32.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279937] Trainable param: llma.layers.32.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.279948] Trainable param: llma.layers.32.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279959] Trainable param: llma.layers.32.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.279972] Trainable param: llma.layers.33.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.279982] Trainable param: llma.layers.33.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.279993] Trainable param: llma.layers.33.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280003] Trainable param: llma.layers.33.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280014] Trainable param: llma.layers.33.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280024] Trainable param: llma.layers.33.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280035] Trainable param: llma.layers.33.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280045] Trainable param: llma.layers.33.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280057] Trainable param: llma.layers.33.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280067] Trainable param: llma.layers.33.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280079] Trainable param: llma.layers.33.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280088] Trainable param: llma.layers.33.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280099] Trainable param: llma.layers.33.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280109] Trainable param: llma.layers.33.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280120] Trainable param: llma.layers.33.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280131] Trainable param: llma.layers.33.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280144] Trainable param: llma.layers.34.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280154] Trainable param: llma.layers.34.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.280165] Trainable param: llma.layers.34.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280175] Trainable param: llma.layers.34.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280186] Trainable param: llma.layers.34.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280196] Trainable param: llma.layers.34.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280207] Trainable param: llma.layers.34.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280217] Trainable param: llma.layers.34.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280229] Trainable param: llma.layers.34.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280239] Trainable param: llma.layers.34.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280250] Trainable param: llma.layers.34.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280260] Trainable param: llma.layers.34.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280271] Trainable param: llma.layers.34.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280281] Trainable param: llma.layers.34.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280293] Trainable param: llma.layers.34.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280304] Trainable param: llma.layers.34.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280316] Trainable param: llma.layers.35.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280326] Trainable param: llma.layers.35.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.280338] Trainable param: llma.layers.35.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280348] Trainable param: llma.layers.35.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280359] Trainable param: llma.layers.35.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280369] Trainable param: llma.layers.35.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280380] Trainable param: llma.layers.35.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280389] Trainable param: llma.layers.35.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280401] Trainable param: llma.layers.35.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280411] Trainable param: llma.layers.35.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280423] Trainable param: llma.layers.35.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280432] Trainable param: llma.layers.35.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280443] Trainable param: llma.layers.35.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280453] Trainable param: llma.layers.35.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280464] Trainable param: llma.layers.35.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280475] Trainable param: llma.layers.35.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280488] Trainable param: llma.layers.36.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280498] Trainable param: llma.layers.36.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.280509] Trainable param: llma.layers.36.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280519] Trainable param: llma.layers.36.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280530] Trainable param: llma.layers.36.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280540] Trainable param: llma.layers.36.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280551] Trainable param: llma.layers.36.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280561] Trainable param: llma.layers.36.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280573] Trainable param: llma.layers.36.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280583] Trainable param: llma.layers.36.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280594] Trainable param: llma.layers.36.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280604] Trainable param: llma.layers.36.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280615] Trainable param: llma.layers.36.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280625] Trainable param: llma.layers.36.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280636] Trainable param: llma.layers.36.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280647] Trainable param: llma.layers.36.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280660] Trainable param: llma.layers.37.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280669] Trainable param: llma.layers.37.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.280681] Trainable param: llma.layers.37.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280691] Trainable param: llma.layers.37.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280702] Trainable param: llma.layers.37.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280712] Trainable param: llma.layers.37.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280723] Trainable param: llma.layers.37.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280733] Trainable param: llma.layers.37.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280745] Trainable param: llma.layers.37.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280755] Trainable param: llma.layers.37.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280766] Trainable param: llma.layers.37.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280776] Trainable param: llma.layers.37.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280787] Trainable param: llma.layers.37.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280797] Trainable param: llma.layers.37.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280808] Trainable param: llma.layers.37.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280818] Trainable param: llma.layers.37.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280830] Trainable param: llma.layers.38.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280839] Trainable param: llma.layers.38.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.280849] Trainable param: llma.layers.38.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280858] Trainable param: llma.layers.38.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280868] Trainable param: llma.layers.38.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280877] Trainable param: llma.layers.38.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.280888] Trainable param: llma.layers.38.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280897] Trainable param: llma.layers.38.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280908] Trainable param: llma.layers.38.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280917] Trainable param: llma.layers.38.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280932] Trainable param: llma.layers.38.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280942] Trainable param: llma.layers.38.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280952] Trainable param: llma.layers.38.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.280961] Trainable param: llma.layers.38.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.280971] Trainable param: llma.layers.38.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280981] Trainable param: llma.layers.38.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.280993] Trainable param: llma.layers.39.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281002] Trainable param: llma.layers.39.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.281013] Trainable param: llma.layers.39.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281022] Trainable param: llma.layers.39.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281032] Trainable param: llma.layers.39.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281041] Trainable param: llma.layers.39.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281051] Trainable param: llma.layers.39.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281060] Trainable param: llma.layers.39.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281072] Trainable param: llma.layers.39.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281081] Trainable param: llma.layers.39.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281093] Trainable param: llma.layers.39.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281102] Trainable param: llma.layers.39.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281112] Trainable param: llma.layers.39.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281121] Trainable param: llma.layers.39.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281131] Trainable param: llma.layers.39.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281141] Trainable param: llma.layers.39.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281153] Trainable param: llma.layers.40.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281162] Trainable param: llma.layers.40.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.281172] Trainable param: llma.layers.40.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281181] Trainable param: llma.layers.40.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281192] Trainable param: llma.layers.40.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281201] Trainable param: llma.layers.40.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281211] Trainable param: llma.layers.40.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281220] Trainable param: llma.layers.40.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281231] Trainable param: llma.layers.40.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281240] Trainable param: llma.layers.40.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281250] Trainable param: llma.layers.40.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281259] Trainable param: llma.layers.40.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281270] Trainable param: llma.layers.40.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281279] Trainable param: llma.layers.40.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281289] Trainable param: llma.layers.40.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281299] Trainable param: llma.layers.40.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281311] Trainable param: llma.layers.41.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281320] Trainable param: llma.layers.41.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.281330] Trainable param: llma.layers.41.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281339] Trainable param: llma.layers.41.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281349] Trainable param: llma.layers.41.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281359] Trainable param: llma.layers.41.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281370] Trainable param: llma.layers.41.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281379] Trainable param: llma.layers.41.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281390] Trainable param: llma.layers.41.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281399] Trainable param: llma.layers.41.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281409] Trainable param: llma.layers.41.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281418] Trainable param: llma.layers.41.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281428] Trainable param: llma.layers.41.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281437] Trainable param: llma.layers.41.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281448] Trainable param: llma.layers.41.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281458] Trainable param: llma.layers.41.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281470] Trainable param: llma.layers.42.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281479] Trainable param: llma.layers.42.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.281489] Trainable param: llma.layers.42.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281498] Trainable param: llma.layers.42.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281508] Trainable param: llma.layers.42.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281521] Trainable param: llma.layers.42.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281531] Trainable param: llma.layers.42.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281540] Trainable param: llma.layers.42.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281551] Trainable param: llma.layers.42.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281560] Trainable param: llma.layers.42.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281571] Trainable param: llma.layers.42.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281580] Trainable param: llma.layers.42.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281590] Trainable param: llma.layers.42.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281599] Trainable param: llma.layers.42.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281609] Trainable param: llma.layers.42.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281620] Trainable param: llma.layers.42.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281632] Trainable param: llma.layers.43.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281641] Trainable param: llma.layers.43.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.281652] Trainable param: llma.layers.43.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281661] Trainable param: llma.layers.43.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281671] Trainable param: llma.layers.43.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281680] Trainable param: llma.layers.43.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281690] Trainable param: llma.layers.43.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281699] Trainable param: llma.layers.43.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281710] Trainable param: llma.layers.43.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281719] Trainable param: llma.layers.43.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281730] Trainable param: llma.layers.43.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281739] Trainable param: llma.layers.43.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281749] Trainable param: llma.layers.43.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281758] Trainable param: llma.layers.43.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281769] Trainable param: llma.layers.43.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281779] Trainable param: llma.layers.43.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281791] Trainable param: llma.layers.44.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281800] Trainable param: llma.layers.44.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.281810] Trainable param: llma.layers.44.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281819] Trainable param: llma.layers.44.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281829] Trainable param: llma.layers.44.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281838] Trainable param: llma.layers.44.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281848] Trainable param: llma.layers.44.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281857] Trainable param: llma.layers.44.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281869] Trainable param: llma.layers.44.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281878] Trainable param: llma.layers.44.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281888] Trainable param: llma.layers.44.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281897] Trainable param: llma.layers.44.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281908] Trainable param: llma.layers.44.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281916] Trainable param: llma.layers.44.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.281927] Trainable param: llma.layers.44.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281936] Trainable param: llma.layers.44.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.281948] Trainable param: llma.layers.45.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281957] Trainable param: llma.layers.45.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.281967] Trainable param: llma.layers.45.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281976] Trainable param: llma.layers.45.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.281987] Trainable param: llma.layers.45.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.281996] Trainable param: llma.layers.45.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282006] Trainable param: llma.layers.45.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282015] Trainable param: llma.layers.45.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282026] Trainable param: llma.layers.45.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282035] Trainable param: llma.layers.45.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282045] Trainable param: llma.layers.45.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282054] Trainable param: llma.layers.45.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282065] Trainable param: llma.layers.45.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282073] Trainable param: llma.layers.45.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282084] Trainable param: llma.layers.45.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282094] Trainable param: llma.layers.45.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282106] Trainable param: llma.layers.46.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282115] Trainable param: llma.layers.46.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.282125] Trainable param: llma.layers.46.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282134] Trainable param: llma.layers.46.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282144] Trainable param: llma.layers.46.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282153] Trainable param: llma.layers.46.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282164] Trainable param: llma.layers.46.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282173] Trainable param: llma.layers.46.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282184] Trainable param: llma.layers.46.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282193] Trainable param: llma.layers.46.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282203] Trainable param: llma.layers.46.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282212] Trainable param: llma.layers.46.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282223] Trainable param: llma.layers.46.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282232] Trainable param: llma.layers.46.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282242] Trainable param: llma.layers.46.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282253] Trainable param: llma.layers.46.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282272] Trainable param: llma.layers.47.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282286] Trainable param: llma.layers.47.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.282302] Trainable param: llma.layers.47.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282316] Trainable param: llma.layers.47.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282333] Trainable param: llma.layers.47.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282347] Trainable param: llma.layers.47.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282363] Trainable param: llma.layers.47.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282378] Trainable param: llma.layers.47.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282396] Trainable param: llma.layers.47.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282411] Trainable param: llma.layers.47.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282428] Trainable param: llma.layers.47.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282443] Trainable param: llma.layers.47.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282460] Trainable param: llma.layers.47.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282475] Trainable param: llma.layers.47.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282492] Trainable param: llma.layers.47.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282509] Trainable param: llma.layers.47.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282530] Trainable param: llma.layers.48.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282546] Trainable param: llma.layers.48.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.282564] Trainable param: llma.layers.48.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282579] Trainable param: llma.layers.48.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282597] Trainable param: llma.layers.48.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282613] Trainable param: llma.layers.48.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282630] Trainable param: llma.layers.48.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282645] Trainable param: llma.layers.48.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282665] Trainable param: llma.layers.48.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282681] Trainable param: llma.layers.48.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282699] Trainable param: llma.layers.48.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282714] Trainable param: llma.layers.48.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282732] Trainable param: llma.layers.48.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282747] Trainable param: llma.layers.48.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282766] Trainable param: llma.layers.48.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282783] Trainable param: llma.layers.48.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282805] Trainable param: llma.layers.49.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282821] Trainable param: llma.layers.49.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.282839] Trainable param: llma.layers.49.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282854] Trainable param: llma.layers.49.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282871] Trainable param: llma.layers.49.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282885] Trainable param: llma.layers.49.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.282901] Trainable param: llma.layers.49.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282916] Trainable param: llma.layers.49.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282934] Trainable param: llma.layers.49.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282949] Trainable param: llma.layers.49.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.282966] Trainable param: llma.layers.49.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.282980] Trainable param: llma.layers.49.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.282997] Trainable param: llma.layers.49.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283012] Trainable param: llma.layers.49.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.283028] Trainable param: llma.layers.49.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283043] Trainable param: llma.layers.49.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283063] Trainable param: llma.layers.50.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283078] Trainable param: llma.layers.50.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.283093] Trainable param: llma.layers.50.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283107] Trainable param: llma.layers.50.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.283123] Trainable param: llma.layers.50.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283138] Trainable param: llma.layers.50.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.283155] Trainable param: llma.layers.50.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283170] Trainable param: llma.layers.50.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283188] Trainable param: llma.layers.50.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283203] Trainable param: llma.layers.50.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.283221] Trainable param: llma.layers.50.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283237] Trainable param: llma.layers.50.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283254] Trainable param: llma.layers.50.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283270] Trainable param: llma.layers.50.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.283288] Trainable param: llma.layers.50.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283306] Trainable param: llma.layers.50.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283327] Trainable param: llma.layers.51.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283343] Trainable param: llma.layers.51.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.283362] Trainable param: llma.layers.51.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283377] Trainable param: llma.layers.51.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.283395] Trainable param: llma.layers.51.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283410] Trainable param: llma.layers.51.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.283428] Trainable param: llma.layers.51.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283444] Trainable param: llma.layers.51.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283464] Trainable param: llma.layers.51.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283479] Trainable param: llma.layers.51.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.283498] Trainable param: llma.layers.51.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283514] Trainable param: llma.layers.51.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283532] Trainable param: llma.layers.51.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283548] Trainable param: llma.layers.51.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.283566] Trainable param: llma.layers.51.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283583] Trainable param: llma.layers.51.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283605] Trainable param: llma.layers.52.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283621] Trainable param: llma.layers.52.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.283639] Trainable param: llma.layers.52.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283654] Trainable param: llma.layers.52.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.283673] Trainable param: llma.layers.52.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283688] Trainable param: llma.layers.52.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.283704] Trainable param: llma.layers.52.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283720] Trainable param: llma.layers.52.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283740] Trainable param: llma.layers.52.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283756] Trainable param: llma.layers.52.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.283773] Trainable param: llma.layers.52.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283789] Trainable param: llma.layers.52.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283807] Trainable param: llma.layers.52.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283823] Trainable param: llma.layers.52.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.283841] Trainable param: llma.layers.52.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283859] Trainable param: llma.layers.52.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.283881] Trainable param: llma.layers.53.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283897] Trainable param: llma.layers.53.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.283915] Trainable param: llma.layers.53.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283931] Trainable param: llma.layers.53.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.283949] Trainable param: llma.layers.53.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283964] Trainable param: llma.layers.53.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.283981] Trainable param: llma.layers.53.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.283996] Trainable param: llma.layers.53.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284014] Trainable param: llma.layers.53.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284029] Trainable param: llma.layers.53.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.284047] Trainable param: llma.layers.53.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284062] Trainable param: llma.layers.53.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284079] Trainable param: llma.layers.53.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284094] Trainable param: llma.layers.53.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.284111] Trainable param: llma.layers.53.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284128] Trainable param: llma.layers.53.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284153] Trainable param: llma.layers.54.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284169] Trainable param: llma.layers.54.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.284188] Trainable param: llma.layers.54.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284204] Trainable param: llma.layers.54.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.284223] Trainable param: llma.layers.54.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284240] Trainable param: llma.layers.54.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.284258] Trainable param: llma.layers.54.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284274] Trainable param: llma.layers.54.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284294] Trainable param: llma.layers.54.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284310] Trainable param: llma.layers.54.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.284328] Trainable param: llma.layers.54.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284344] Trainable param: llma.layers.54.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284362] Trainable param: llma.layers.54.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284378] Trainable param: llma.layers.54.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.284396] Trainable param: llma.layers.54.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284413] Trainable param: llma.layers.54.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284434] Trainable param: llma.layers.55.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284449] Trainable param: llma.layers.55.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.284467] Trainable param: llma.layers.55.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284482] Trainable param: llma.layers.55.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.284501] Trainable param: llma.layers.55.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284518] Trainable param: llma.layers.55.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.284537] Trainable param: llma.layers.55.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284555] Trainable param: llma.layers.55.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284575] Trainable param: llma.layers.55.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284592] Trainable param: llma.layers.55.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.284611] Trainable param: llma.layers.55.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284628] Trainable param: llma.layers.55.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284647] Trainable param: llma.layers.55.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284663] Trainable param: llma.layers.55.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.284681] Trainable param: llma.layers.55.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284701] Trainable param: llma.layers.55.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284725] Trainable param: llma.layers.56.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284742] Trainable param: llma.layers.56.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.284762] Trainable param: llma.layers.56.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284779] Trainable param: llma.layers.56.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.284797] Trainable param: llma.layers.56.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284813] Trainable param: llma.layers.56.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.284832] Trainable param: llma.layers.56.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284847] Trainable param: llma.layers.56.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284868] Trainable param: llma.layers.56.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284884] Trainable param: llma.layers.56.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.284903] Trainable param: llma.layers.56.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284920] Trainable param: llma.layers.56.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284939] Trainable param: llma.layers.56.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.284955] Trainable param: llma.layers.56.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.284974] Trainable param: llma.layers.56.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.284992] Trainable param: llma.layers.56.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285015] Trainable param: llma.layers.57.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285032] Trainable param: llma.layers.57.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.285050] Trainable param: llma.layers.57.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285067] Trainable param: llma.layers.57.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.285084] Trainable param: llma.layers.57.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285100] Trainable param: llma.layers.57.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.285117] Trainable param: llma.layers.57.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285132] Trainable param: llma.layers.57.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285151] Trainable param: llma.layers.57.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285166] Trainable param: llma.layers.57.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.285184] Trainable param: llma.layers.57.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285199] Trainable param: llma.layers.57.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285216] Trainable param: llma.layers.57.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285231] Trainable param: llma.layers.57.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.285247] Trainable param: llma.layers.57.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285263] Trainable param: llma.layers.57.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285282] Trainable param: llma.layers.58.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285297] Trainable param: llma.layers.58.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.285314] Trainable param: llma.layers.58.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285328] Trainable param: llma.layers.58.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.285345] Trainable param: llma.layers.58.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285359] Trainable param: llma.layers.58.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.285375] Trainable param: llma.layers.58.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285389] Trainable param: llma.layers.58.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285407] Trainable param: llma.layers.58.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285421] Trainable param: llma.layers.58.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.285438] Trainable param: llma.layers.58.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285452] Trainable param: llma.layers.58.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285469] Trainable param: llma.layers.58.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285483] Trainable param: llma.layers.58.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.285499] Trainable param: llma.layers.58.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285523] Trainable param: llma.layers.58.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285544] Trainable param: llma.layers.59.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285558] Trainable param: llma.layers.59.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.285574] Trainable param: llma.layers.59.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285589] Trainable param: llma.layers.59.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.285607] Trainable param: llma.layers.59.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285623] Trainable param: llma.layers.59.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.285641] Trainable param: llma.layers.59.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285657] Trainable param: llma.layers.59.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285677] Trainable param: llma.layers.59.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285693] Trainable param: llma.layers.59.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.285711] Trainable param: llma.layers.59.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285727] Trainable param: llma.layers.59.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285745] Trainable param: llma.layers.59.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285761] Trainable param: llma.layers.59.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.285777] Trainable param: llma.layers.59.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285794] Trainable param: llma.layers.59.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285815] Trainable param: llma.layers.60.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285830] Trainable param: llma.layers.60.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.285847] Trainable param: llma.layers.60.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285862] Trainable param: llma.layers.60.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.285878] Trainable param: llma.layers.60.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285893] Trainable param: llma.layers.60.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.285910] Trainable param: llma.layers.60.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285925] Trainable param: llma.layers.60.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.285943] Trainable param: llma.layers.60.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285957] Trainable param: llma.layers.60.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.285974] Trainable param: llma.layers.60.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.285989] Trainable param: llma.layers.60.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286005] Trainable param: llma.layers.60.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286019] Trainable param: llma.layers.60.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.286036] Trainable param: llma.layers.60.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286053] Trainable param: llma.layers.60.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286073] Trainable param: llma.layers.61.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286087] Trainable param: llma.layers.61.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.286104] Trainable param: llma.layers.61.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286119] Trainable param: llma.layers.61.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.286135] Trainable param: llma.layers.61.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286149] Trainable param: llma.layers.61.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.286166] Trainable param: llma.layers.61.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286181] Trainable param: llma.layers.61.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286200] Trainable param: llma.layers.61.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286215] Trainable param: llma.layers.61.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.286232] Trainable param: llma.layers.61.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286247] Trainable param: llma.layers.61.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286264] Trainable param: llma.layers.61.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286280] Trainable param: llma.layers.61.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.286297] Trainable param: llma.layers.61.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286314] Trainable param: llma.layers.61.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286335] Trainable param: llma.layers.62.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286350] Trainable param: llma.layers.62.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.286368] Trainable param: llma.layers.62.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286383] Trainable param: llma.layers.62.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.286401] Trainable param: llma.layers.62.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286417] Trainable param: llma.layers.62.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.286436] Trainable param: llma.layers.62.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286451] Trainable param: llma.layers.62.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286470] Trainable param: llma.layers.62.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286486] Trainable param: llma.layers.62.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.286503] Trainable param: llma.layers.62.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286519] Trainable param: llma.layers.62.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286536] Trainable param: llma.layers.62.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286550] Trainable param: llma.layers.62.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.286567] Trainable param: llma.layers.62.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286584] Trainable param: llma.layers.62.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286605] Trainable param: llma.layers.63.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286620] Trainable param: llma.layers.63.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.286638] Trainable param: llma.layers.63.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286653] Trainable param: llma.layers.63.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.286671] Trainable param: llma.layers.63.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286686] Trainable param: llma.layers.63.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.286704] Trainable param: llma.layers.63.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286720] Trainable param: llma.layers.63.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286739] Trainable param: llma.layers.63.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286753] Trainable param: llma.layers.63.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.286769] Trainable param: llma.layers.63.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286783] Trainable param: llma.layers.63.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286801] Trainable param: llma.layers.63.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286815] Trainable param: llma.layers.63.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.286833] Trainable param: llma.layers.63.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286851] Trainable param: llma.layers.63.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.286874] Trainable param: llma.layers.64.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286889] Trainable param: llma.layers.64.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.286907] Trainable param: llma.layers.64.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286922] Trainable param: llma.layers.64.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.286939] Trainable param: llma.layers.64.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286954] Trainable param: llma.layers.64.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.286971] Trainable param: llma.layers.64.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.286985] Trainable param: llma.layers.64.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287004] Trainable param: llma.layers.64.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287019] Trainable param: llma.layers.64.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.287036] Trainable param: llma.layers.64.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287051] Trainable param: llma.layers.64.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287067] Trainable param: llma.layers.64.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287082] Trainable param: llma.layers.64.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.287099] Trainable param: llma.layers.64.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287115] Trainable param: llma.layers.64.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287136] Trainable param: llma.layers.65.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287151] Trainable param: llma.layers.65.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.287168] Trainable param: llma.layers.65.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287183] Trainable param: llma.layers.65.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.287199] Trainable param: llma.layers.65.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287214] Trainable param: llma.layers.65.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.287230] Trainable param: llma.layers.65.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287245] Trainable param: llma.layers.65.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287263] Trainable param: llma.layers.65.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287277] Trainable param: llma.layers.65.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.287293] Trainable param: llma.layers.65.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287308] Trainable param: llma.layers.65.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287324] Trainable param: llma.layers.65.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287338] Trainable param: llma.layers.65.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.287354] Trainable param: llma.layers.65.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287369] Trainable param: llma.layers.65.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287389] Trainable param: llma.layers.66.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287404] Trainable param: llma.layers.66.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.287421] Trainable param: llma.layers.66.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287435] Trainable param: llma.layers.66.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.287452] Trainable param: llma.layers.66.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287467] Trainable param: llma.layers.66.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.287484] Trainable param: llma.layers.66.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287499] Trainable param: llma.layers.66.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287518] Trainable param: llma.layers.66.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287532] Trainable param: llma.layers.66.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.287549] Trainable param: llma.layers.66.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287564] Trainable param: llma.layers.66.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287581] Trainable param: llma.layers.66.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287596] Trainable param: llma.layers.66.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.287613] Trainable param: llma.layers.66.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287629] Trainable param: llma.layers.66.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287649] Trainable param: llma.layers.67.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287663] Trainable param: llma.layers.67.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.287680] Trainable param: llma.layers.67.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287695] Trainable param: llma.layers.67.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.287711] Trainable param: llma.layers.67.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287725] Trainable param: llma.layers.67.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.287742] Trainable param: llma.layers.67.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287757] Trainable param: llma.layers.67.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287775] Trainable param: llma.layers.67.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287788] Trainable param: llma.layers.67.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.287804] Trainable param: llma.layers.67.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287819] Trainable param: llma.layers.67.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287838] Trainable param: llma.layers.67.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287854] Trainable param: llma.layers.67.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.287873] Trainable param: llma.layers.67.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287892] Trainable param: llma.layers.67.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.287914] Trainable param: llma.layers.68.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287931] Trainable param: llma.layers.68.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.287950] Trainable param: llma.layers.68.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.287965] Trainable param: llma.layers.68.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.287984] Trainable param: llma.layers.68.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288000] Trainable param: llma.layers.68.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.288018] Trainable param: llma.layers.68.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288033] Trainable param: llma.layers.68.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288053] Trainable param: llma.layers.68.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288069] Trainable param: llma.layers.68.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.288088] Trainable param: llma.layers.68.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288104] Trainable param: llma.layers.68.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288122] Trainable param: llma.layers.68.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288138] Trainable param: llma.layers.68.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.288156] Trainable param: llma.layers.68.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288173] Trainable param: llma.layers.68.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288194] Trainable param: llma.layers.69.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288210] Trainable param: llma.layers.69.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.288226] Trainable param: llma.layers.69.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288241] Trainable param: llma.layers.69.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.288258] Trainable param: llma.layers.69.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288272] Trainable param: llma.layers.69.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.288289] Trainable param: llma.layers.69.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288303] Trainable param: llma.layers.69.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288322] Trainable param: llma.layers.69.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288337] Trainable param: llma.layers.69.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.288355] Trainable param: llma.layers.69.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288369] Trainable param: llma.layers.69.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288387] Trainable param: llma.layers.69.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288402] Trainable param: llma.layers.69.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.288419] Trainable param: llma.layers.69.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288436] Trainable param: llma.layers.69.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288457] Trainable param: llma.layers.70.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288472] Trainable param: llma.layers.70.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.288489] Trainable param: llma.layers.70.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288504] Trainable param: llma.layers.70.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.288521] Trainable param: llma.layers.70.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288535] Trainable param: llma.layers.70.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.288551] Trainable param: llma.layers.70.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288565] Trainable param: llma.layers.70.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288583] Trainable param: llma.layers.70.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288598] Trainable param: llma.layers.70.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.288615] Trainable param: llma.layers.70.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288630] Trainable param: llma.layers.70.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288649] Trainable param: llma.layers.70.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288664] Trainable param: llma.layers.70.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.288681] Trainable param: llma.layers.70.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288699] Trainable param: llma.layers.70.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288719] Trainable param: llma.layers.71.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288735] Trainable param: llma.layers.71.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.288752] Trainable param: llma.layers.71.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288767] Trainable param: llma.layers.71.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.288785] Trainable param: llma.layers.71.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288800] Trainable param: llma.layers.71.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.288818] Trainable param: llma.layers.71.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288834] Trainable param: llma.layers.71.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288853] Trainable param: llma.layers.71.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288868] Trainable param: llma.layers.71.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.288886] Trainable param: llma.layers.71.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288901] Trainable param: llma.layers.71.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288919] Trainable param: llma.layers.71.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.288934] Trainable param: llma.layers.71.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.288951] Trainable param: llma.layers.71.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288967] Trainable param: llma.layers.71.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.288989] Trainable param: llma.layers.72.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289005] Trainable param: llma.layers.72.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.289023] Trainable param: llma.layers.72.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289038] Trainable param: llma.layers.72.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.289056] Trainable param: llma.layers.72.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289071] Trainable param: llma.layers.72.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.289088] Trainable param: llma.layers.72.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289104] Trainable param: llma.layers.72.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289124] Trainable param: llma.layers.72.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289139] Trainable param: llma.layers.72.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.289157] Trainable param: llma.layers.72.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289172] Trainable param: llma.layers.72.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289190] Trainable param: llma.layers.72.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289205] Trainable param: llma.layers.72.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.289222] Trainable param: llma.layers.72.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289238] Trainable param: llma.layers.72.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289259] Trainable param: llma.layers.73.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289274] Trainable param: llma.layers.73.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.289292] Trainable param: llma.layers.73.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289307] Trainable param: llma.layers.73.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.289324] Trainable param: llma.layers.73.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289339] Trainable param: llma.layers.73.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.289356] Trainable param: llma.layers.73.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289370] Trainable param: llma.layers.73.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289389] Trainable param: llma.layers.73.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289404] Trainable param: llma.layers.73.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.289421] Trainable param: llma.layers.73.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289436] Trainable param: llma.layers.73.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289452] Trainable param: llma.layers.73.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289467] Trainable param: llma.layers.73.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.289483] Trainable param: llma.layers.73.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289499] Trainable param: llma.layers.73.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289525] Trainable param: llma.layers.74.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289541] Trainable param: llma.layers.74.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.289559] Trainable param: llma.layers.74.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289573] Trainable param: llma.layers.74.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.289590] Trainable param: llma.layers.74.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289604] Trainable param: llma.layers.74.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.289621] Trainable param: llma.layers.74.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289635] Trainable param: llma.layers.74.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289653] Trainable param: llma.layers.74.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289667] Trainable param: llma.layers.74.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.289684] Trainable param: llma.layers.74.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289699] Trainable param: llma.layers.74.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289715] Trainable param: llma.layers.74.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289729] Trainable param: llma.layers.74.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.289745] Trainable param: llma.layers.74.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289761] Trainable param: llma.layers.74.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289782] Trainable param: llma.layers.75.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289797] Trainable param: llma.layers.75.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.289815] Trainable param: llma.layers.75.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289830] Trainable param: llma.layers.75.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.289847] Trainable param: llma.layers.75.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289862] Trainable param: llma.layers.75.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.289878] Trainable param: llma.layers.75.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289892] Trainable param: llma.layers.75.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289909] Trainable param: llma.layers.75.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289924] Trainable param: llma.layers.75.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.289942] Trainable param: llma.layers.75.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289957] Trainable param: llma.layers.75.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.289975] Trainable param: llma.layers.75.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.289991] Trainable param: llma.layers.75.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290009] Trainable param: llma.layers.75.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290026] Trainable param: llma.layers.75.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290048] Trainable param: llma.layers.76.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290063] Trainable param: llma.layers.76.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.290081] Trainable param: llma.layers.76.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290097] Trainable param: llma.layers.76.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.290115] Trainable param: llma.layers.76.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290130] Trainable param: llma.layers.76.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.290148] Trainable param: llma.layers.76.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290163] Trainable param: llma.layers.76.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290183] Trainable param: llma.layers.76.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290199] Trainable param: llma.layers.76.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290217] Trainable param: llma.layers.76.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290255] Trainable param: llma.layers.76.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290273] Trainable param: llma.layers.76.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290287] Trainable param: llma.layers.76.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290301] Trainable param: llma.layers.76.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290311] Trainable param: llma.layers.76.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290324] Trainable param: llma.layers.77.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290334] Trainable param: llma.layers.77.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.290345] Trainable param: llma.layers.77.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290354] Trainable param: llma.layers.77.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.290364] Trainable param: llma.layers.77.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290373] Trainable param: llma.layers.77.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.290384] Trainable param: llma.layers.77.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290393] Trainable param: llma.layers.77.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290405] Trainable param: llma.layers.77.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290414] Trainable param: llma.layers.77.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290425] Trainable param: llma.layers.77.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290434] Trainable param: llma.layers.77.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290444] Trainable param: llma.layers.77.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290453] Trainable param: llma.layers.77.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290463] Trainable param: llma.layers.77.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290473] Trainable param: llma.layers.77.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290485] Trainable param: llma.layers.78.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290495] Trainable param: llma.layers.78.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.290506] Trainable param: llma.layers.78.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290515] Trainable param: llma.layers.78.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.290525] Trainable param: llma.layers.78.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290534] Trainable param: llma.layers.78.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.290545] Trainable param: llma.layers.78.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290554] Trainable param: llma.layers.78.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290565] Trainable param: llma.layers.78.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290574] Trainable param: llma.layers.78.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290584] Trainable param: llma.layers.78.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290593] Trainable param: llma.layers.78.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290603] Trainable param: llma.layers.78.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290612] Trainable param: llma.layers.78.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290623] Trainable param: llma.layers.78.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290633] Trainable param: llma.layers.78.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290645] Trainable param: llma.layers.79.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290654] Trainable param: llma.layers.79.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 -[19:27:29.290665] Trainable param: llma.layers.79.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290675] Trainable param: llma.layers.79.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.290686] Trainable param: llma.layers.79.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290695] Trainable param: llma.layers.79.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 -[19:27:29.290705] Trainable param: llma.layers.79.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290714] Trainable param: llma.layers.79.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290725] Trainable param: llma.layers.79.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290734] Trainable param: llma.layers.79.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290745] Trainable param: llma.layers.79.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290754] Trainable param: llma.layers.79.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290764] Trainable param: llma.layers.79.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290773] Trainable param: llma.layers.79.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 -[19:27:29.290783] Trainable param: llma.layers.79.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290793] Trainable param: llma.layers.79.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290804] Trainable param: llma.norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 -[19:27:29.290815] Trainable param: llma.output.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 -[19:27:29.290840] load pretrained from ../checkpoints/llama2/Llama-2-70b/ -[19:27:29.290845] Quantizing model to 4bit! -[19:28:10.394107] Epoch: [0] [1340/6500] lr: 0.000010 closs: 0.8034 (0.8657) grad_norm: 0.5445 (0.8165) time: 5.5771 data: 0.0002 max mem: 71357 -[19:29:06.239565] Epoch: [0] [1350/6500] lr: 0.000010 closs: 0.8034 (0.8652) grad_norm: 0.5277 (0.8142) time: 5.5869 data: 0.0002 max mem: 71357 -[19:30:01.961975] Epoch: [0] [1360/6500] lr: 0.000010 closs: 0.7740 (0.8646) grad_norm: 0.5703 (0.8128) time: 5.5783 data: 0.0002 max mem: 71357 -[19:30:57.763309] Epoch: [0] [1370/6500] lr: 0.000011 closs: 0.7126 (0.8635) grad_norm: 0.5703 (0.8116) time: 5.5761 data: 0.0002 max mem: 71357 -[19:31:53.564694] Epoch: [0] [1380/6500] lr: 0.000011 closs: 0.7537 (0.8634) grad_norm: 0.5703 (0.8103) time: 5.5800 data: 0.0001 max mem: 71357 -[19:32:49.286247] Epoch: [0] [1390/6500] lr: 0.000011 closs: 0.7631 (0.8624) grad_norm: 0.6007 (0.8092) time: 5.5761 data: 0.0001 max mem: 71357 -[19:33:45.056994] Epoch: [0] [1400/6500] lr: 0.000011 closs: 0.8355 (0.8627) grad_norm: 0.5855 (0.8086) time: 5.5745 data: 0.0001 max mem: 71357 -[19:34:40.796559] Epoch: [0] [1410/6500] lr: 0.000011 closs: 0.8545 (0.8626) grad_norm: 0.5738 (0.8066) time: 5.5754 data: 0.0002 max mem: 71357 -[19:35:36.566183] Epoch: [0] [1420/6500] lr: 0.000011 closs: 0.7831 (0.8623) grad_norm: 0.4588 (0.8040) time: 5.5753 data: 0.0002 max mem: 71357 -[19:36:32.355594] Epoch: [0] [1430/6500] lr: 0.000011 closs: 0.8227 (0.8622) grad_norm: 0.5588 (0.8042) time: 5.5778 data: 0.0001 max mem: 71357 -[19:37:28.095893] Epoch: [0] [1440/6500] lr: 0.000011 closs: 0.8106 (0.8617) grad_norm: 0.5302 (0.8031) time: 5.5764 data: 0.0001 max mem: 71357 -[19:38:23.857545] Epoch: [0] [1450/6500] lr: 0.000011 closs: 0.7735 (0.8613) grad_norm: 0.5302 (0.8011) time: 5.5750 data: 0.0001 max mem: 71357 -[19:39:19.647799] Epoch: [0] [1460/6500] lr: 0.000011 closs: 0.8252 (0.8612) grad_norm: 0.5502 (0.8047) time: 5.5775 data: 0.0001 max mem: 71357 -[19:40:15.482957] Epoch: [0] [1470/6500] lr: 0.000011 closs: 0.8374 (0.8608) grad_norm: 0.5301 (0.8027) time: 5.5811 data: 0.0002 max mem: 71357 -[19:41:11.141116] Epoch: [0] [1480/6500] lr: 0.000011 closs: 0.8001 (0.8606) grad_norm: 0.5333 (0.8027) time: 5.5745 data: 0.0002 max mem: 71357 -[19:42:06.984066] Epoch: [0] [1490/6500] lr: 0.000011 closs: 0.7398 (0.8594) grad_norm: 0.5333 (0.8013) time: 5.5749 data: 0.0001 max mem: 71357 -[19:43:02.746693] Epoch: [0] [1500/6500] lr: 0.000012 closs: 0.7381 (0.8590) grad_norm: 0.5309 (0.8012) time: 5.5802 data: 0.0002 max mem: 71357 -[19:43:58.570975] Epoch: [0] [1510/6500] lr: 0.000012 closs: 0.8013 (0.8588) grad_norm: 0.4857 (0.7988) time: 5.5793 data: 0.0001 max mem: 71357 -[19:44:54.319260] Epoch: [0] [1520/6500] lr: 0.000012 closs: 0.8824 (0.8588) grad_norm: 0.5054 (0.7980) time: 5.5786 data: 0.0002 max mem: 71357 -[19:45:50.069233] Epoch: [0] [1530/6500] lr: 0.000012 closs: 0.8063 (0.8585) grad_norm: 0.4937 (0.7963) time: 5.5748 data: 0.0001 max mem: 71357 -[19:46:45.742778] Epoch: [0] [1540/6500] lr: 0.000012 closs: 0.7936 (0.8579) grad_norm: 0.5091 (0.7950) time: 5.5711 data: 0.0001 max mem: 71357 -[19:47:41.512558] Epoch: [0] [1550/6500] lr: 0.000012 closs: 0.7926 (0.8576) grad_norm: 0.5252 (0.7939) time: 5.5720 data: 0.0001 max mem: 71357 -[19:48:37.387361] Epoch: [0] [1560/6500] lr: 0.000012 closs: 0.7592 (0.8569) grad_norm: 0.4907 (0.7924) time: 5.5821 data: 0.0001 max mem: 71357 -[19:49:33.103590] Epoch: [0] [1570/6500] lr: 0.000012 closs: 0.7427 (0.8565) grad_norm: 0.5252 (0.7923) time: 5.5794 data: 0.0001 max mem: 71357 -[19:50:28.953920] Epoch: [0] [1580/6500] lr: 0.000012 closs: 0.7489 (0.8559) grad_norm: 0.4674 (0.7904) time: 5.5783 data: 0.0001 max mem: 71357 -[19:51:24.796572] Epoch: [0] [1590/6500] lr: 0.000012 closs: 0.7778 (0.8552) grad_norm: 0.4821 (0.7886) time: 5.5845 data: 0.0002 max mem: 71357 -[19:52:20.573169] Epoch: [0] [1600/6500] lr: 0.000012 closs: 0.8162 (0.8553) grad_norm: 0.5151 (0.7882) time: 5.5808 data: 0.0002 max mem: 71357 -[19:53:16.418950] Epoch: [0] [1610/6500] lr: 0.000012 closs: 0.8162 (0.8549) grad_norm: 0.5045 (0.7865) time: 5.5810 data: 0.0001 max mem: 71357 -[19:54:12.129422] Epoch: [0] [1620/6500] lr: 0.000012 closs: 0.7935 (0.8548) grad_norm: 0.5045 (0.7844) time: 5.5777 data: 0.0001 max mem: 71357 -[19:55:07.940492] Epoch: [0] [1630/6500] lr: 0.000013 closs: 0.8191 (0.8547) grad_norm: 0.5045 (0.7841) time: 5.5760 data: 0.0001 max mem: 71357 -[19:56:03.688378] Epoch: [0] [1640/6500] lr: 0.000013 closs: 0.8281 (0.8546) grad_norm: 0.4828 (0.7837) time: 5.5779 data: 0.0001 max mem: 71357 -[19:56:59.568279] Epoch: [0] [1650/6500] lr: 0.000013 closs: 0.7958 (0.8542) grad_norm: 0.4776 (0.7821) time: 5.5813 data: 0.0001 max mem: 71357 -[19:57:55.339672] Epoch: [0] [1660/6500] lr: 0.000013 closs: 0.7472 (0.8538) grad_norm: 0.6027 (0.7811) time: 5.5825 data: 0.0002 max mem: 71357 -[19:58:50.972681] Epoch: [0] [1670/6500] lr: 0.000013 closs: 0.7411 (0.8532) grad_norm: 0.5621 (0.7802) time: 5.5701 data: 0.0002 max mem: 71357 -[19:59:46.690981] Epoch: [0] [1680/6500] lr: 0.000013 closs: 0.7657 (0.8525) grad_norm: 0.5570 (0.7789) time: 5.5674 data: 0.0002 max mem: 71357 -[20:00:42.465398] Epoch: [0] [1690/6500] lr: 0.000013 closs: 0.6785 (0.8513) grad_norm: 0.5621 (0.7781) time: 5.5745 data: 0.0002 max mem: 71357 -[20:01:38.307864] Epoch: [0] [1700/6500] lr: 0.000013 closs: 0.6685 (0.8507) grad_norm: 0.5087 (0.7765) time: 5.5808 data: 0.0001 max mem: 71357 -[20:02:34.069065] Epoch: [0] [1710/6500] lr: 0.000013 closs: 0.7515 (0.8502) grad_norm: 0.5087 (0.7756) time: 5.5801 data: 0.0001 max mem: 71357 -[20:03:29.815804] Epoch: [0] [1720/6500] lr: 0.000013 closs: 0.7740 (0.8501) grad_norm: 0.5406 (0.7768) time: 5.5753 data: 0.0001 max mem: 71357 -[20:04:25.658015] Epoch: [0] [1730/6500] lr: 0.000013 closs: 0.7883 (0.8499) grad_norm: 0.5054 (0.7752) time: 5.5793 data: 0.0001 max mem: 71357 -[20:05:21.448684] Epoch: [0] [1740/6500] lr: 0.000013 closs: 0.7883 (0.8497) grad_norm: 0.5406 (0.7747) time: 5.5815 data: 0.0002 max mem: 71357 -[20:06:17.132635] Epoch: [0] [1750/6500] lr: 0.000013 closs: 0.7562 (0.8491) grad_norm: 0.5160 (0.7736) time: 5.5737 data: 0.0002 max mem: 71357 -[20:07:12.915410] Epoch: [0] [1760/6500] lr: 0.000014 closs: 0.8120 (0.8491) grad_norm: 0.5160 (0.7725) time: 5.5733 data: 0.0001 max mem: 71357 -[20:08:08.564010] Epoch: [0] [1770/6500] lr: 0.000014 closs: 0.9002 (0.8495) grad_norm: 0.6313 (0.7726) time: 5.5715 data: 0.0001 max mem: 71357 -[20:09:04.351531] Epoch: [0] [1780/6500] lr: 0.000014 closs: 0.9002 (0.8491) grad_norm: 0.5656 (0.7711) time: 5.5717 data: 0.0001 max mem: 71357 -[20:10:00.035219] Epoch: [0] [1790/6500] lr: 0.000014 closs: 0.7180 (0.8488) grad_norm: 0.6461 (0.7707) time: 5.5734 data: 0.0002 max mem: 71357 -[20:10:55.870427] Epoch: [0] [1800/6500] lr: 0.000014 closs: 0.7484 (0.8481) grad_norm: 0.5875 (0.7696) time: 5.5758 data: 0.0002 max mem: 71357 -[20:11:51.589061] Epoch: [0] [1810/6500] lr: 0.000014 closs: 0.7187 (0.8474) grad_norm: 0.5284 (0.7691) time: 5.5776 data: 0.0001 max mem: 71357 -[20:12:47.419446] Epoch: [0] [1820/6500] lr: 0.000014 closs: 0.7622 (0.8475) grad_norm: 0.5546 (0.7679) time: 5.5774 data: 0.0002 max mem: 71357 -[20:13:43.152888] Epoch: [0] [1830/6500] lr: 0.000014 closs: 0.8333 (0.8472) grad_norm: 0.5013 (0.7662) time: 5.5781 data: 0.0002 max mem: 71357 -[20:14:38.893836] Epoch: [0] [1840/6500] lr: 0.000014 closs: 0.8029 (0.8472) grad_norm: 0.5110 (0.7648) time: 5.5736 data: 0.0002 max mem: 71357 -[20:15:34.630747] Epoch: [0] [1850/6500] lr: 0.000014 closs: 0.8166 (0.8470) grad_norm: 0.4765 (0.7632) time: 5.5738 data: 0.0001 max mem: 71357 -[20:16:30.314557] Epoch: [0] [1860/6500] lr: 0.000014 closs: 0.8166 (0.8469) grad_norm: 0.4707 (0.7628) time: 5.5709 data: 0.0001 max mem: 71357 -[20:17:26.194852] Epoch: [0] [1870/6500] lr: 0.000014 closs: 0.8289 (0.8467) grad_norm: 0.4765 (0.7619) time: 5.5781 data: 0.0001 max mem: 71357 -[20:18:21.871034] Epoch: [0] [1880/6500] lr: 0.000014 closs: 0.7967 (0.8464) grad_norm: 0.5001 (0.7617) time: 5.5778 data: 0.0001 max mem: 71357 -[20:19:17.591186] Epoch: [0] [1890/6500] lr: 0.000015 closs: 0.7326 (0.8458) grad_norm: 0.6081 (0.7608) time: 5.5697 data: 0.0001 max mem: 71357 -[20:20:13.289294] Epoch: [0] [1900/6500] lr: 0.000015 closs: 0.7281 (0.8450) grad_norm: 0.5038 (0.7597) time: 5.5708 data: 0.0002 max mem: 71357 -[20:21:09.177977] Epoch: [0] [1910/6500] lr: 0.000015 closs: 0.7328 (0.8453) grad_norm: 0.5284 (0.7604) time: 5.5792 data: 0.0002 max mem: 71357 -[20:22:04.997084] Epoch: [0] [1920/6500] lr: 0.000015 closs: 0.8228 (0.8450) grad_norm: 0.5284 (0.7592) time: 5.5853 data: 0.0001 max mem: 71357 -[20:23:00.755724] Epoch: [0] [1930/6500] lr: 0.000015 closs: 0.8124 (0.8447) grad_norm: 0.5513 (0.7585) time: 5.5788 data: 0.0001 max mem: 71357 -[20:23:56.446255] Epoch: [0] [1940/6500] lr: 0.000015 closs: 0.8124 (0.8448) grad_norm: 0.5656 (0.7573) time: 5.5723 data: 0.0001 max mem: 71357 -[20:24:52.260103] Epoch: [0] [1950/6500] lr: 0.000015 closs: 0.7912 (0.8445) grad_norm: 0.5052 (0.7558) time: 5.5751 data: 0.0001 max mem: 71357 -[20:25:47.913210] Epoch: [0] [1960/6500] lr: 0.000015 closs: 0.7160 (0.8440) grad_norm: 0.5052 (0.7555) time: 5.5733 data: 0.0002 max mem: 71357 -[20:26:43.623002] Epoch: [0] [1970/6500] lr: 0.000015 closs: 0.7569 (0.8438) grad_norm: 0.5321 (0.7545) time: 5.5681 data: 0.0002 max mem: 71357 -[20:27:39.403420] Epoch: [0] [1980/6500] lr: 0.000015 closs: 0.8105 (0.8436) grad_norm: 0.5093 (0.7539) time: 5.5744 data: 0.0002 max mem: 71357 -[20:28:35.232297] Epoch: [0] [1990/6500] lr: 0.000015 closs: 0.7956 (0.8437) grad_norm: 0.5200 (0.7528) time: 5.5804 data: 0.0001 max mem: 71357 -[20:29:30.942620] Epoch: [0] [2000/6500] lr: 0.000015 closs: 0.7954 (0.8438) grad_norm: 0.5200 (0.7519) time: 5.5769 data: 0.0001 max mem: 71357 -[20:30:26.631297] Epoch: [0] [2010/6500] lr: 0.000015 closs: 0.7865 (0.8435) grad_norm: 0.4947 (0.7508) time: 5.5698 data: 0.0001 max mem: 71357 -[20:31:22.375631] Epoch: [0] [2020/6500] lr: 0.000016 closs: 0.7190 (0.8429) grad_norm: 0.4947 (0.7500) time: 5.5715 data: 0.0002 max mem: 71357 -[20:32:18.160803] Epoch: [0] [2030/6500] lr: 0.000016 closs: 0.6794 (0.8426) grad_norm: 0.5117 (0.7508) time: 5.5763 data: 0.0002 max mem: 71357 -[20:33:14.032422] Epoch: [0] [2040/6500] lr: 0.000016 closs: 0.6984 (0.8418) grad_norm: 0.5095 (0.7495) time: 5.5827 data: 0.0001 max mem: 71357 -[20:34:09.674229] Epoch: [0] [2050/6500] lr: 0.000016 closs: 0.7023 (0.8413) grad_norm: 0.5421 (0.7498) time: 5.5756 data: 0.0001 max mem: 71357 -[20:35:05.378186] Epoch: [0] [2060/6500] lr: 0.000016 closs: 0.7354 (0.8410) grad_norm: 0.6405 (0.7500) time: 5.5672 data: 0.0002 max mem: 71357 -[20:36:01.171645] Epoch: [0] [2070/6500] lr: 0.000016 closs: 0.7757 (0.8410) grad_norm: 0.5421 (0.7489) time: 5.5747 data: 0.0002 max mem: 71357 -[20:36:57.047870] Epoch: [0] [2080/6500] lr: 0.000016 closs: 0.8422 (0.8410) grad_norm: 0.5457 (0.7477) time: 5.5834 data: 0.0002 max mem: 71357 -[20:37:52.852566] Epoch: [0] [2090/6500] lr: 0.000016 closs: 0.7849 (0.8408) grad_norm: 0.5457 (0.7474) time: 5.5840 data: 0.0001 max mem: 71357 -[20:38:48.604842] Epoch: [0] [2100/6500] lr: 0.000016 closs: 0.7323 (0.8402) grad_norm: 0.4892 (0.7468) time: 5.5778 data: 0.0001 max mem: 71357 -[20:39:44.382964] Epoch: [0] [2110/6500] lr: 0.000016 closs: 0.7806 (0.8405) grad_norm: 0.4828 (0.7459) time: 5.5764 data: 0.0002 max mem: 71357 -[20:40:40.183395] Epoch: [0] [2120/6500] lr: 0.000016 closs: 0.8687 (0.8406) grad_norm: 0.4828 (0.7446) time: 5.5788 data: 0.0001 max mem: 71357 -[20:41:35.950265] Epoch: [0] [2130/6500] lr: 0.000016 closs: 0.7655 (0.8401) grad_norm: 0.4828 (0.7435) time: 5.5782 data: 0.0001 max mem: 71357 -[20:42:31.699836] Epoch: [0] [2140/6500] lr: 0.000016 closs: 0.7670 (0.8399) grad_norm: 0.4762 (0.7425) time: 5.5757 data: 0.0001 max mem: 71357 -[20:43:27.317712] Epoch: [0] [2150/6500] lr: 0.000017 closs: 0.7810 (0.8397) grad_norm: 0.5027 (0.7424) time: 5.5683 data: 0.0001 max mem: 71357 -[20:44:23.007754] Epoch: [0] [2160/6500] lr: 0.000017 closs: 0.7664 (0.8397) grad_norm: 0.5038 (0.7418) time: 5.5653 data: 0.0001 max mem: 71357 -[20:45:18.802104] Epoch: [0] [2170/6500] lr: 0.000017 closs: 0.7484 (0.8394) grad_norm: 0.5949 (0.7415) time: 5.5741 data: 0.0002 max mem: 71357 -[20:46:14.517863] Epoch: [0] [2180/6500] lr: 0.000017 closs: 0.7521 (0.8392) grad_norm: 0.5192 (0.7403) time: 5.5754 data: 0.0002 max mem: 71357 -[20:47:10.200594] Epoch: [0] [2190/6500] lr: 0.000017 closs: 0.7133 (0.8389) grad_norm: 0.5528 (0.7399) time: 5.5698 data: 0.0001 max mem: 71357 -[20:48:05.878498] Epoch: [0] [2200/6500] lr: 0.000017 closs: 0.7301 (0.8386) grad_norm: 0.5528 (0.7390) time: 5.5679 data: 0.0002 max mem: 71357 -[20:49:01.610047] Epoch: [0] [2210/6500] lr: 0.000017 closs: 0.7366 (0.8381) grad_norm: 0.5171 (0.7379) time: 5.5704 data: 0.0001 max mem: 71357 -[20:49:57.481127] Epoch: [0] [2220/6500] lr: 0.000017 closs: 0.7101 (0.8375) grad_norm: 0.5008 (0.7367) time: 5.5800 data: 0.0002 max mem: 71357 -[20:50:53.173124] Epoch: [0] [2230/6500] lr: 0.000017 closs: 0.7101 (0.8372) grad_norm: 0.4889 (0.7370) time: 5.5780 data: 0.0002 max mem: 71357 -[20:51:48.892801] Epoch: [0] [2240/6500] lr: 0.000017 closs: 0.7136 (0.8366) grad_norm: 0.4889 (0.7363) time: 5.5704 data: 0.0002 max mem: 71357 -[20:52:44.665693] Epoch: [0] [2250/6500] lr: 0.000017 closs: 0.7372 (0.8365) grad_norm: 0.5008 (0.7355) time: 5.5745 data: 0.0002 max mem: 71357 -[20:53:40.495845] Epoch: [0] [2260/6500] lr: 0.000017 closs: 0.7952 (0.8366) grad_norm: 0.5428 (0.7346) time: 5.5800 data: 0.0002 max mem: 71357 -[20:54:36.203059] Epoch: [0] [2270/6500] lr: 0.000017 closs: 0.7883 (0.8365) grad_norm: 0.5428 (0.7345) time: 5.5768 data: 0.0002 max mem: 71357 -[20:55:31.918928] Epoch: [0] [2280/6500] lr: 0.000018 closs: 0.7588 (0.8359) grad_norm: 0.4861 (0.7333) time: 5.5711 data: 0.0002 max mem: 71357 -[20:56:27.730546] Epoch: [0] [2290/6500] lr: 0.000018 closs: 0.7458 (0.8357) grad_norm: 0.4716 (0.7331) time: 5.5763 data: 0.0002 max mem: 71357 -[20:57:23.453980] Epoch: [0] [2300/6500] lr: 0.000018 closs: 0.7519 (0.8351) grad_norm: 0.4951 (0.7329) time: 5.5767 data: 0.0002 max mem: 71357 -[20:58:19.330645] Epoch: [0] [2310/6500] lr: 0.000018 closs: 0.7519 (0.8351) grad_norm: 0.4951 (0.7318) time: 5.5799 data: 0.0001 max mem: 71357 -[20:59:15.049870] Epoch: [0] [2320/6500] lr: 0.000018 closs: 0.7585 (0.8348) grad_norm: 0.5165 (0.7310) time: 5.5797 data: 0.0001 max mem: 71357 -[21:00:10.840941] Epoch: [0] [2330/6500] lr: 0.000018 closs: 0.7507 (0.8345) grad_norm: 0.5165 (0.7304) time: 5.5754 data: 0.0001 max mem: 71357 -[21:01:06.575722] Epoch: [0] [2340/6500] lr: 0.000018 closs: 0.7689 (0.8342) grad_norm: 0.4833 (0.7296) time: 5.5762 data: 0.0002 max mem: 71357 -[21:02:02.422716] Epoch: [0] [2350/6500] lr: 0.000018 closs: 0.7870 (0.8342) grad_norm: 0.4735 (0.7284) time: 5.5790 data: 0.0002 max mem: 71357 -[21:02:58.275189] Epoch: [0] [2360/6500] lr: 0.000018 closs: 0.8006 (0.8342) grad_norm: 0.4750 (0.7274) time: 5.5848 data: 0.0001 max mem: 71357 -[21:03:54.007925] Epoch: [0] [2370/6500] lr: 0.000018 closs: 0.7329 (0.8334) grad_norm: 0.4833 (0.7271) time: 5.5791 data: 0.0001 max mem: 71357 -[21:04:49.752785] Epoch: [0] [2380/6500] lr: 0.000018 closs: 0.7316 (0.8335) grad_norm: 0.4943 (0.7266) time: 5.5738 data: 0.0002 max mem: 71357 -[21:05:45.680885] Epoch: [0] [2390/6500] lr: 0.000018 closs: 0.8211 (0.8335) grad_norm: 0.5098 (0.7261) time: 5.5836 data: 0.0002 max mem: 71357 -[21:06:41.469027] Epoch: [0] [2400/6500] lr: 0.000018 closs: 0.8356 (0.8335) grad_norm: 0.5608 (0.7252) time: 5.5857 data: 0.0001 max mem: 71357 -[21:07:37.229547] Epoch: [0] [2410/6500] lr: 0.000019 closs: 0.7179 (0.8329) grad_norm: 0.5037 (0.7249) time: 5.5773 data: 0.0002 max mem: 71357 -[21:08:32.958561] Epoch: [0] [2420/6500] lr: 0.000019 closs: 0.7157 (0.8327) grad_norm: 0.5037 (0.7241) time: 5.5744 data: 0.0002 max mem: 71357 -[21:09:28.673873] Epoch: [0] [2430/6500] lr: 0.000019 closs: 0.7412 (0.8324) grad_norm: 0.4534 (0.7229) time: 5.5721 data: 0.0002 max mem: 71357 -[21:10:24.504516] Epoch: [0] [2440/6500] lr: 0.000019 closs: 0.8024 (0.8324) grad_norm: 0.4475 (0.7218) time: 5.5772 data: 0.0002 max mem: 71357 -[21:11:20.170214] Epoch: [0] [2450/6500] lr: 0.000019 closs: 0.7852 (0.8321) grad_norm: 0.4525 (0.7210) time: 5.5747 data: 0.0002 max mem: 71357 -[21:12:15.876408] Epoch: [0] [2460/6500] lr: 0.000019 closs: 0.7852 (0.8318) grad_norm: 0.4266 (0.7201) time: 5.5685 data: 0.0002 max mem: 71357 -[21:13:11.617462] Epoch: [0] [2470/6500] lr: 0.000019 closs: 0.8011 (0.8319) grad_norm: 0.4346 (0.7192) time: 5.5723 data: 0.0002 max mem: 71357 -[21:14:07.477465] Epoch: [0] [2480/6500] lr: 0.000019 closs: 0.7495 (0.8318) grad_norm: 0.4759 (0.7184) time: 5.5799 data: 0.0001 max mem: 71357 -[21:15:03.160431] Epoch: [0] [2490/6500] lr: 0.000019 closs: 0.7495 (0.8316) grad_norm: 0.4835 (0.7178) time: 5.5770 data: 0.0002 max mem: 71357 -[21:15:58.872968] Epoch: [0] [2500/6500] lr: 0.000019 closs: 0.7675 (0.8312) grad_norm: 0.4726 (0.7166) time: 5.5697 data: 0.0002 max mem: 71357 -[21:16:54.619091] Epoch: [0] [2510/6500] lr: 0.000019 closs: 0.7675 (0.8312) grad_norm: 0.4883 (0.7158) time: 5.5728 data: 0.0001 max mem: 71357 -[21:17:50.323182] Epoch: [0] [2520/6500] lr: 0.000019 closs: 0.8567 (0.8313) grad_norm: 0.5139 (0.7170) time: 5.5724 data: 0.0001 max mem: 71357 -[21:18:46.181308] Epoch: [0] [2530/6500] lr: 0.000019 closs: 0.8298 (0.8314) grad_norm: 0.5134 (0.7165) time: 5.5780 data: 0.0001 max mem: 71357 -[21:19:41.968783] Epoch: [0] [2540/6500] lr: 0.000020 closs: 0.8169 (0.8311) grad_norm: 0.5156 (0.7158) time: 5.5821 data: 0.0001 max mem: 71357 -[21:20:37.843995] Epoch: [0] [2550/6500] lr: 0.000020 closs: 0.8056 (0.8310) grad_norm: 0.5134 (0.7148) time: 5.5830 data: 0.0002 max mem: 71357 -[21:21:33.609151] Epoch: [0] [2560/6500] lr: 0.000020 closs: 0.7994 (0.8310) grad_norm: 0.5011 (0.7142) time: 5.5819 data: 0.0002 max mem: 71357 -[21:22:29.557411] Epoch: [0] [2570/6500] lr: 0.000020 closs: 0.7904 (0.8309) grad_norm: 0.5156 (0.7135) time: 5.5855 data: 0.0002 max mem: 71357 -[21:23:25.337398] Epoch: [0] [2580/6500] lr: 0.000020 closs: 0.7925 (0.8307) grad_norm: 0.5411 (0.7129) time: 5.5863 data: 0.0002 max mem: 71357 -[21:24:21.118584] Epoch: [0] [2590/6500] lr: 0.000020 closs: 0.7940 (0.8308) grad_norm: 0.5367 (0.7119) time: 5.5779 data: 0.0002 max mem: 71357 -[21:25:16.916564] Epoch: [0] [2600/6500] lr: 0.000020 closs: 0.7737 (0.8306) grad_norm: 0.5209 (0.7115) time: 5.5789 data: 0.0002 max mem: 71357 -[21:26:12.737112] Epoch: [0] [2610/6500] lr: 0.000020 closs: 0.7394 (0.8305) grad_norm: 0.5209 (0.7118) time: 5.5808 data: 0.0002 max mem: 71357 -[21:27:08.510919] Epoch: [0] [2620/6500] lr: 0.000020 closs: 0.7944 (0.8304) grad_norm: 0.4956 (0.7111) time: 5.5796 data: 0.0002 max mem: 71357 -[21:28:04.306907] Epoch: [0] [2630/6500] lr: 0.000020 closs: 0.7944 (0.8303) grad_norm: 0.4956 (0.7102) time: 5.5784 data: 0.0002 max mem: 71357 -[21:29:00.079713] Epoch: [0] [2640/6500] lr: 0.000020 closs: 0.7446 (0.8298) grad_norm: 0.4956 (0.7094) time: 5.5784 data: 0.0002 max mem: 71357 -[21:29:55.866717] Epoch: [0] [2650/6500] lr: 0.000020 closs: 0.7546 (0.8296) grad_norm: 0.4603 (0.7084) time: 5.5779 data: 0.0001 max mem: 71357 -[21:30:51.780482] Epoch: [0] [2660/6500] lr: 0.000020 closs: 0.8542 (0.8297) grad_norm: 0.4237 (0.7076) time: 5.5850 data: 0.0001 max mem: 71357 -[21:31:47.549029] Epoch: [0] [2670/6500] lr: 0.000021 closs: 0.7719 (0.8291) grad_norm: 0.4430 (0.7072) time: 5.5840 data: 0.0001 max mem: 71357 -[21:32:43.367174] Epoch: [0] [2680/6500] lr: 0.000021 closs: 0.7024 (0.8288) grad_norm: 0.4734 (0.7067) time: 5.5792 data: 0.0002 max mem: 71357 -[21:33:39.102598] Epoch: [0] [2690/6500] lr: 0.000021 closs: 0.7855 (0.8287) grad_norm: 0.5127 (0.7059) time: 5.5776 data: 0.0002 max mem: 71357 -[21:34:35.007085] Epoch: [0] [2700/6500] lr: 0.000021 closs: 0.7978 (0.8288) grad_norm: 0.5239 (0.7054) time: 5.5819 data: 0.0002 max mem: 71357 -[21:35:30.772658] Epoch: [0] [2710/6500] lr: 0.000021 closs: 0.7809 (0.8284) grad_norm: 0.5049 (0.7043) time: 5.5834 data: 0.0002 max mem: 71357 -[21:36:26.600799] Epoch: [0] [2720/6500] lr: 0.000021 closs: 0.7460 (0.8281) grad_norm: 0.5109 (0.7037) time: 5.5796 data: 0.0002 max mem: 71357 -[21:37:22.396532] Epoch: [0] [2730/6500] lr: 0.000021 closs: 0.6897 (0.8277) grad_norm: 0.4972 (0.7030) time: 5.5811 data: 0.0001 max mem: 71357 -[21:38:18.216414] Epoch: [0] [2740/6500] lr: 0.000021 closs: 0.7477 (0.8275) grad_norm: 0.4605 (0.7029) time: 5.5806 data: 0.0002 max mem: 71357 -[21:39:14.267187] Epoch: [0] [2750/6500] lr: 0.000021 closs: 0.7866 (0.8273) grad_norm: 0.4752 (0.7020) time: 5.5934 data: 0.0002 max mem: 71357 -[21:40:10.212618] Epoch: [0] [2760/6500] lr: 0.000021 closs: 0.7746 (0.8272) grad_norm: 0.4605 (0.7010) time: 5.5996 data: 0.0002 max mem: 71357 -[21:41:05.958460] Epoch: [0] [2770/6500] lr: 0.000021 closs: 0.7746 (0.8272) grad_norm: 0.4752 (0.7008) time: 5.5844 data: 0.0002 max mem: 71357 -[21:42:01.770293] Epoch: [0] [2780/6500] lr: 0.000021 closs: 0.8009 (0.8272) grad_norm: 0.4752 (0.7002) time: 5.5778 data: 0.0002 max mem: 71357 -[21:42:57.678502] Epoch: [0] [2790/6500] lr: 0.000021 closs: 0.8009 (0.8271) grad_norm: 0.4831 (0.7007) time: 5.5859 data: 0.0002 max mem: 71357 -[21:43:53.448984] Epoch: [0] [2800/6500] lr: 0.000022 closs: 0.8074 (0.8271) grad_norm: 0.5026 (0.7000) time: 5.5838 data: 0.0002 max mem: 71357 -[21:44:49.284849] Epoch: [0] [2810/6500] lr: 0.000022 closs: 0.8074 (0.8270) grad_norm: 0.4758 (0.6991) time: 5.5802 data: 0.0002 max mem: 71357 -[21:45:45.008739] Epoch: [0] [2820/6500] lr: 0.000022 closs: 0.7502 (0.8266) grad_norm: 0.4573 (0.6983) time: 5.5778 data: 0.0002 max mem: 71357 -[21:46:40.941467] Epoch: [0] [2830/6500] lr: 0.000022 closs: 0.7663 (0.8264) grad_norm: 0.4320 (0.6973) time: 5.5827 data: 0.0002 max mem: 71357 -[21:47:36.741840] Epoch: [0] [2840/6500] lr: 0.000022 closs: 0.8014 (0.8264) grad_norm: 0.4285 (0.6973) time: 5.5866 data: 0.0002 max mem: 71357 -[21:48:32.454452] Epoch: [0] [2850/6500] lr: 0.000022 closs: 0.8047 (0.8261) grad_norm: 0.4406 (0.6966) time: 5.5756 data: 0.0001 max mem: 71357 -[21:49:28.166471] Epoch: [0] [2860/6500] lr: 0.000022 closs: 0.6620 (0.8256) grad_norm: 0.4406 (0.6963) time: 5.5711 data: 0.0001 max mem: 71357 -[21:50:23.965017] Epoch: [0] [2870/6500] lr: 0.000022 closs: 0.6841 (0.8253) grad_norm: 0.5207 (0.6958) time: 5.5754 data: 0.0002 max mem: 71357 -[21:51:19.762370] Epoch: [0] [2880/6500] lr: 0.000022 closs: 0.7397 (0.8252) grad_norm: 0.5657 (0.6979) time: 5.5797 data: 0.0002 max mem: 71357 -[21:52:15.502191] Epoch: [0] [2890/6500] lr: 0.000022 closs: 0.8410 (0.8254) grad_norm: 0.5332 (0.6971) time: 5.5767 data: 0.0001 max mem: 71357 -[21:53:11.116475] Epoch: [0] [2900/6500] lr: 0.000022 closs: 0.7739 (0.8251) grad_norm: 0.5285 (0.6964) time: 5.5676 data: 0.0001 max mem: 71357 -[21:54:06.797736] Epoch: [0] [2910/6500] lr: 0.000022 closs: 0.7956 (0.8252) grad_norm: 0.5048 (0.6961) time: 5.5647 data: 0.0001 max mem: 71357 -[21:55:02.572561] Epoch: [0] [2920/6500] lr: 0.000022 closs: 0.8117 (0.8251) grad_norm: 0.4933 (0.6954) time: 5.5727 data: 0.0001 max mem: 71357 -[21:55:58.265640] Epoch: [0] [2930/6500] lr: 0.000023 closs: 0.7544 (0.8247) grad_norm: 0.4933 (0.6957) time: 5.5732 data: 0.0002 max mem: 71357 -[21:56:53.952100] Epoch: [0] [2940/6500] lr: 0.000023 closs: 0.8150 (0.8249) grad_norm: 0.5311 (0.6952) time: 5.5689 data: 0.0002 max mem: 71357 -[21:57:49.722026] Epoch: [0] [2950/6500] lr: 0.000023 closs: 0.8308 (0.8248) grad_norm: 0.5311 (0.6948) time: 5.5727 data: 0.0001 max mem: 71357 -[21:58:45.409923] Epoch: [0] [2960/6500] lr: 0.000023 closs: 0.8063 (0.8248) grad_norm: 0.4842 (0.6945) time: 5.5728 data: 0.0002 max mem: 71357 -[21:59:41.250401] Epoch: [0] [2970/6500] lr: 0.000023 closs: 0.8054 (0.8246) grad_norm: 0.4842 (0.6939) time: 5.5763 data: 0.0002 max mem: 71357 -[22:00:36.996921] Epoch: [0] [2980/6500] lr: 0.000023 closs: 0.7675 (0.8245) grad_norm: 0.4995 (0.6935) time: 5.5793 data: 0.0002 max mem: 71357 -[22:01:32.739821] Epoch: [0] [2990/6500] lr: 0.000023 closs: 0.7084 (0.8242) grad_norm: 0.5038 (0.6933) time: 5.5744 data: 0.0002 max mem: 71357 -[22:02:28.417654] Epoch: [0] [3000/6500] lr: 0.000023 closs: 0.7755 (0.8242) grad_norm: 0.5038 (0.6928) time: 5.5709 data: 0.0002 max mem: 71357 -[22:03:24.195424] Epoch: [0] [3010/6500] lr: 0.000023 closs: 0.7861 (0.8242) grad_norm: 0.5038 (0.6922) time: 5.5727 data: 0.0001 max mem: 71357 -[22:04:19.925026] Epoch: [0] [3020/6500] lr: 0.000023 closs: 0.7833 (0.8241) grad_norm: 0.4425 (0.6914) time: 5.5753 data: 0.0001 max mem: 71357 -[22:05:15.625126] Epoch: [0] [3030/6500] lr: 0.000023 closs: 0.8012 (0.8242) grad_norm: 0.4276 (0.6903) time: 5.5713 data: 0.0002 max mem: 71357 -[22:06:11.301289] Epoch: [0] [3040/6500] lr: 0.000023 closs: 0.8578 (0.8245) grad_norm: 0.4276 (0.6900) time: 5.5687 data: 0.0002 max mem: 71357 -[22:07:07.112997] Epoch: [0] [3050/6500] lr: 0.000023 closs: 0.7844 (0.8243) grad_norm: 0.4425 (0.6896) time: 5.5743 data: 0.0001 max mem: 71357 -[22:08:02.885455] Epoch: [0] [3060/6500] lr: 0.000024 closs: 0.7363 (0.8239) grad_norm: 0.4420 (0.6890) time: 5.5791 data: 0.0001 max mem: 71357 -[22:08:58.574922] Epoch: [0] [3070/6500] lr: 0.000024 closs: 0.7478 (0.8239) grad_norm: 0.4767 (0.6882) time: 5.5730 data: 0.0001 max mem: 71357 -[22:09:54.218364] Epoch: [0] [3080/6500] lr: 0.000024 closs: 0.7478 (0.8238) grad_norm: 0.4767 (0.6877) time: 5.5666 data: 0.0001 max mem: 71357 -[22:10:49.953148] Epoch: [0] [3090/6500] lr: 0.000024 closs: 0.8290 (0.8239) grad_norm: 0.4707 (0.6872) time: 5.5688 data: 0.0002 max mem: 71357 -[22:11:45.777210] Epoch: [0] [3100/6500] lr: 0.000024 closs: 0.8290 (0.8241) grad_norm: 0.5149 (0.6872) time: 5.5778 data: 0.0002 max mem: 71357 -[22:12:41.446476] Epoch: [0] [3110/6500] lr: 0.000024 closs: 0.7752 (0.8239) grad_norm: 0.5202 (0.6871) time: 5.5746 data: 0.0002 max mem: 71357 -[22:13:37.138604] Epoch: [0] [3120/6500] lr: 0.000024 closs: 0.7729 (0.8238) grad_norm: 0.5083 (0.6864) time: 5.5680 data: 0.0001 max mem: 71357 -[22:14:32.885980] Epoch: [0] [3130/6500] lr: 0.000024 closs: 0.7892 (0.8237) grad_norm: 0.5083 (0.6860) time: 5.5719 data: 0.0001 max mem: 71357 -[22:15:28.734038] Epoch: [0] [3140/6500] lr: 0.000024 closs: 0.7201 (0.8233) grad_norm: 0.4951 (0.6855) time: 5.5797 data: 0.0001 max mem: 71357 -[22:16:24.401364] Epoch: [0] [3150/6500] lr: 0.000024 closs: 0.7225 (0.8231) grad_norm: 0.4951 (0.6855) time: 5.5757 data: 0.0001 max mem: 71357 -[22:17:20.190194] Epoch: [0] [3160/6500] lr: 0.000024 closs: 0.7225 (0.8227) grad_norm: 0.5478 (0.6851) time: 5.5727 data: 0.0002 max mem: 71357 -[22:18:15.930741] Epoch: [0] [3170/6500] lr: 0.000024 closs: 0.6877 (0.8226) grad_norm: 0.5478 (0.6849) time: 5.5764 data: 0.0002 max mem: 71357 -[22:19:11.777748] Epoch: [0] [3180/6500] lr: 0.000024 closs: 0.7892 (0.8225) grad_norm: 0.5051 (0.6841) time: 5.5793 data: 0.0002 max mem: 71357 -[22:20:07.668122] Epoch: [0] [3190/6500] lr: 0.000025 closs: 0.7907 (0.8225) grad_norm: 0.4834 (0.6836) time: 5.5868 data: 0.0002 max mem: 71357 -[22:21:03.530895] Epoch: [0] [3200/6500] lr: 0.000025 closs: 0.7862 (0.8223) grad_norm: 0.4458 (0.6829) time: 5.5876 data: 0.0002 max mem: 71357 -[22:21:59.265653] Epoch: [0] [3210/6500] lr: 0.000025 closs: 0.8013 (0.8224) grad_norm: 0.4373 (0.6825) time: 5.5798 data: 0.0001 max mem: 71357 -[22:22:54.998366] Epoch: [0] [3220/6500] lr: 0.000025 closs: 0.8076 (0.8222) grad_norm: 0.4758 (0.6823) time: 5.5733 data: 0.0002 max mem: 71357 -[22:23:50.905774] Epoch: [0] [3230/6500] lr: 0.000025 closs: 0.7747 (0.8219) grad_norm: 0.4758 (0.6816) time: 5.5819 data: 0.0002 max mem: 71357 -[22:24:46.627784] Epoch: [0] [3240/6500] lr: 0.000025 closs: 0.7195 (0.8217) grad_norm: 0.4976 (0.6818) time: 5.5813 data: 0.0002 max mem: 71357 -[22:25:42.340275] Epoch: [0] [3250/6500] lr: 0.000025 closs: 0.8083 (0.8220) grad_norm: 0.4758 (0.6814) time: 5.5716 data: 0.0002 max mem: 71357 -[22:26:38.235003] Epoch: [0] [3260/6500] lr: 0.000025 closs: 0.8261 (0.8221) grad_norm: 0.4669 (0.6806) time: 5.5802 data: 0.0002 max mem: 71357 -[22:27:34.143033] Epoch: [0] [3270/6500] lr: 0.000025 closs: 0.7940 (0.8220) grad_norm: 0.4716 (0.6802) time: 5.5900 data: 0.0002 max mem: 71357 -[22:28:29.848989] Epoch: [0] [3280/6500] lr: 0.000025 closs: 0.7949 (0.8218) grad_norm: 0.4669 (0.6798) time: 5.5806 data: 0.0002 max mem: 71357 -[22:29:25.534312] Epoch: [0] [3290/6500] lr: 0.000025 closs: 0.7710 (0.8220) grad_norm: 0.4918 (0.6792) time: 5.5694 data: 0.0002 max mem: 71357 -[22:30:21.210353] Epoch: [0] [3300/6500] lr: 0.000025 closs: 0.7941 (0.8220) grad_norm: 0.4949 (0.6792) time: 5.5679 data: 0.0002 max mem: 71357 -[22:31:16.927815] Epoch: [0] [3310/6500] lr: 0.000025 closs: 0.7941 (0.8216) grad_norm: 0.4980 (0.6788) time: 5.5695 data: 0.0002 max mem: 71357 -[22:32:12.719534] Epoch: [0] [3320/6500] lr: 0.000026 closs: 0.6892 (0.8212) grad_norm: 0.4980 (0.6783) time: 5.5753 data: 0.0001 max mem: 71357 -[22:33:08.513920] Epoch: [0] [3330/6500] lr: 0.000026 closs: 0.7701 (0.8211) grad_norm: 0.5174 (0.6812) time: 5.5792 data: 0.0001 max mem: 71357 -[22:34:04.304134] Epoch: [0] [3340/6500] lr: 0.000026 closs: 0.7871 (0.8210) grad_norm: 0.4950 (0.6806) time: 5.5791 data: 0.0001 max mem: 71357 -[22:35:00.047708] Epoch: [0] [3350/6500] lr: 0.000026 closs: 0.7350 (0.8207) grad_norm: 0.4521 (0.6801) time: 5.5766 data: 0.0001 max mem: 71357 -[22:35:55.874424] Epoch: [0] [3360/6500] lr: 0.000026 closs: 0.7149 (0.8205) grad_norm: 0.4950 (0.6811) time: 5.5784 data: 0.0002 max mem: 71357 -[22:36:51.608865] Epoch: [0] [3370/6500] lr: 0.000026 closs: 0.6943 (0.8199) grad_norm: 0.4521 (0.6807) time: 5.5779 data: 0.0002 max mem: 71357 -[22:37:47.359747] Epoch: [0] [3380/6500] lr: 0.000026 closs: 0.7499 (0.8199) grad_norm: 0.5135 (0.6802) time: 5.5741 data: 0.0002 max mem: 71357 -[22:38:43.055778] Epoch: [0] [3390/6500] lr: 0.000026 closs: 0.7804 (0.8198) grad_norm: 0.5695 (0.6800) time: 5.5722 data: 0.0002 max mem: 71357 -[22:39:38.791189] Epoch: [0] [3400/6500] lr: 0.000026 closs: 0.7784 (0.8198) grad_norm: 0.5695 (0.6806) time: 5.5714 data: 0.0002 max mem: 71357 -[22:40:34.701703] Epoch: [0] [3410/6500] lr: 0.000026 closs: 0.8139 (0.8198) grad_norm: 0.5514 (0.6797) time: 5.5822 data: 0.0002 max mem: 71357 -[22:41:30.512183] Epoch: [0] [3420/6500] lr: 0.000026 closs: 0.7679 (0.8196) grad_norm: 0.5098 (0.6790) time: 5.5860 data: 0.0002 max mem: 71357 -[22:42:26.256064] Epoch: [0] [3430/6500] lr: 0.000026 closs: 0.7416 (0.8196) grad_norm: 0.4407 (0.6784) time: 5.5776 data: 0.0002 max mem: 71357 -[22:43:21.943934] Epoch: [0] [3440/6500] lr: 0.000026 closs: 0.6919 (0.8193) grad_norm: 0.3898 (0.6776) time: 5.5714 data: 0.0002 max mem: 71357 -[22:44:17.761189] Epoch: [0] [3450/6500] lr: 0.000027 closs: 0.7952 (0.8193) grad_norm: 0.3898 (0.6772) time: 5.5751 data: 0.0002 max mem: 71357 -[22:45:13.506509] Epoch: [0] [3460/6500] lr: 0.000027 closs: 0.8043 (0.8192) grad_norm: 0.3898 (0.6770) time: 5.5780 data: 0.0001 max mem: 71357 -[22:46:09.269380] Epoch: [0] [3470/6500] lr: 0.000027 closs: 0.7872 (0.8192) grad_norm: 0.4201 (0.6766) time: 5.5753 data: 0.0002 max mem: 71357 -[22:47:04.958374] Epoch: [0] [3480/6500] lr: 0.000027 closs: 0.7538 (0.8190) grad_norm: 0.4742 (0.6763) time: 5.5725 data: 0.0002 max mem: 71357 -[22:48:00.838506] Epoch: [0] [3490/6500] lr: 0.000027 closs: 0.7062 (0.8188) grad_norm: 0.4572 (0.6762) time: 5.5784 data: 0.0001 max mem: 71357 -[22:48:56.554925] Epoch: [0] [3500/6500] lr: 0.000027 closs: 0.7649 (0.8187) grad_norm: 0.4557 (0.6755) time: 5.5797 data: 0.0001 max mem: 71357 -[22:49:52.356055] Epoch: [0] [3510/6500] lr: 0.000027 closs: 0.7740 (0.8186) grad_norm: 0.4557 (0.6749) time: 5.5757 data: 0.0001 max mem: 71357 -[22:50:48.051775] Epoch: [0] [3520/6500] lr: 0.000027 closs: 0.6907 (0.8184) grad_norm: 0.4465 (0.6745) time: 5.5747 data: 0.0002 max mem: 71357 -[22:51:43.736248] Epoch: [0] [3530/6500] lr: 0.000027 closs: 0.7185 (0.8184) grad_norm: 0.4883 (0.6741) time: 5.5689 data: 0.0001 max mem: 71357 -[22:52:39.582328] Epoch: [0] [3540/6500] lr: 0.000027 closs: 0.7792 (0.8182) grad_norm: 0.4887 (0.6737) time: 5.5765 data: 0.0001 max mem: 71357 -[22:53:35.217988] Epoch: [0] [3550/6500] lr: 0.000027 closs: 0.7264 (0.8181) grad_norm: 0.4923 (0.6732) time: 5.5740 data: 0.0001 max mem: 71357 -[22:54:31.016009] Epoch: [0] [3560/6500] lr: 0.000027 closs: 0.7264 (0.8179) grad_norm: 0.4923 (0.6727) time: 5.5716 data: 0.0002 max mem: 71357 -[22:55:26.802300] Epoch: [0] [3570/6500] lr: 0.000027 closs: 0.7620 (0.8178) grad_norm: 0.4646 (0.6722) time: 5.5791 data: 0.0002 max mem: 71357 -[22:56:22.687278] Epoch: [0] [3580/6500] lr: 0.000028 closs: 0.7688 (0.8178) grad_norm: 0.4572 (0.6717) time: 5.5835 data: 0.0001 max mem: 71357 -[22:57:18.492948] Epoch: [0] [3590/6500] lr: 0.000028 closs: 0.8586 (0.8179) grad_norm: 0.4335 (0.6714) time: 5.5844 data: 0.0002 max mem: 71357 -[22:58:14.266317] Epoch: [0] [3600/6500] lr: 0.000028 closs: 0.8586 (0.8181) grad_norm: 0.4301 (0.6708) time: 5.5788 data: 0.0002 max mem: 71357 -[22:59:10.030708] Epoch: [0] [3610/6500] lr: 0.000028 closs: 0.7420 (0.8178) grad_norm: 0.4168 (0.6702) time: 5.5767 data: 0.0001 max mem: 71357 -[23:00:05.740740] Epoch: [0] [3620/6500] lr: 0.000028 closs: 0.6845 (0.8177) grad_norm: 0.4147 (0.6697) time: 5.5736 data: 0.0002 max mem: 71357 -[23:01:01.628084] Epoch: [0] [3630/6500] lr: 0.000028 closs: 0.7360 (0.8176) grad_norm: 0.4531 (0.6693) time: 5.5798 data: 0.0002 max mem: 71357 -[23:01:57.312973] Epoch: [0] [3640/6500] lr: 0.000028 closs: 0.7370 (0.8177) grad_norm: 0.4726 (0.6689) time: 5.5785 data: 0.0001 max mem: 71357 -[23:02:53.018097] Epoch: [0] [3650/6500] lr: 0.000028 closs: 0.7370 (0.8174) grad_norm: 0.4726 (0.6684) time: 5.5694 data: 0.0001 max mem: 71357 -[23:03:48.696651] Epoch: [0] [3660/6500] lr: 0.000028 closs: 0.7332 (0.8172) grad_norm: 0.4864 (0.6678) time: 5.5691 data: 0.0001 max mem: 71357 -[23:04:44.522063] Epoch: [0] [3670/6500] lr: 0.000028 closs: 0.7933 (0.8171) grad_norm: 0.4716 (0.6672) time: 5.5751 data: 0.0001 max mem: 71357 -[23:05:40.229307] Epoch: [0] [3680/6500] lr: 0.000028 closs: 0.7823 (0.8170) grad_norm: 0.4653 (0.6668) time: 5.5765 data: 0.0002 max mem: 71357 -[23:06:36.012740] Epoch: [0] [3690/6500] lr: 0.000028 closs: 0.7470 (0.8169) grad_norm: 0.4864 (0.6664) time: 5.5744 data: 0.0002 max mem: 71357 -[23:07:31.710488] Epoch: [0] [3700/6500] lr: 0.000028 closs: 0.7934 (0.8167) grad_norm: 0.4874 (0.6661) time: 5.5740 data: 0.0002 max mem: 71357 -[23:08:27.509042] Epoch: [0] [3710/6500] lr: 0.000029 closs: 0.8132 (0.8167) grad_norm: 0.4970 (0.6656) time: 5.5747 data: 0.0001 max mem: 71357 -[23:09:23.247672] Epoch: [0] [3720/6500] lr: 0.000029 closs: 0.7672 (0.8166) grad_norm: 0.4874 (0.6649) time: 5.5767 data: 0.0001 max mem: 71357 -[23:10:19.028843] Epoch: [0] [3730/6500] lr: 0.000029 closs: 0.8333 (0.8167) grad_norm: 0.4414 (0.6645) time: 5.5758 data: 0.0001 max mem: 71357 -[23:11:14.783761] Epoch: [0] [3740/6500] lr: 0.000029 closs: 0.7984 (0.8165) grad_norm: 0.4400 (0.6643) time: 5.5767 data: 0.0001 max mem: 71357 -[23:12:10.561004] Epoch: [0] [3750/6500] lr: 0.000029 closs: 0.7420 (0.8166) grad_norm: 0.4073 (0.6638) time: 5.5765 data: 0.0001 max mem: 71357 -[23:13:06.422660] Epoch: [0] [3760/6500] lr: 0.000029 closs: 0.7733 (0.8165) grad_norm: 0.3984 (0.6631) time: 5.5818 data: 0.0002 max mem: 71357 -[23:14:02.251523] Epoch: [0] [3770/6500] lr: 0.000029 closs: 0.7733 (0.8165) grad_norm: 0.4193 (0.6628) time: 5.5844 data: 0.0002 max mem: 71357 -[23:14:57.959067] Epoch: [0] [3780/6500] lr: 0.000029 closs: 0.7491 (0.8164) grad_norm: 0.4418 (0.6623) time: 5.5767 data: 0.0002 max mem: 71357 -[23:15:53.761394] Epoch: [0] [3790/6500] lr: 0.000029 closs: 0.7587 (0.8163) grad_norm: 0.4418 (0.6619) time: 5.5753 data: 0.0003 max mem: 71357 -[23:16:49.649809] Epoch: [0] [3800/6500] lr: 0.000029 closs: 0.7967 (0.8163) grad_norm: 0.4321 (0.6611) time: 5.5844 data: 0.0002 max mem: 71357 -[23:17:45.356972] Epoch: [0] [3810/6500] lr: 0.000029 closs: 0.7972 (0.8161) grad_norm: 0.4144 (0.6607) time: 5.5797 data: 0.0001 max mem: 71357 -[23:18:41.088973] Epoch: [0] [3820/6500] lr: 0.000029 closs: 0.7616 (0.8161) grad_norm: 0.4144 (0.6602) time: 5.5719 data: 0.0001 max mem: 71357 -[23:19:36.728934] Epoch: [0] [3830/6500] lr: 0.000029 closs: 0.7773 (0.8161) grad_norm: 0.4110 (0.6597) time: 5.5685 data: 0.0001 max mem: 71357 -[23:20:32.508264] Epoch: [0] [3840/6500] lr: 0.000030 closs: 0.7510 (0.8158) grad_norm: 0.4507 (0.6591) time: 5.5708 data: 0.0002 max mem: 71357 -[23:21:28.314526] Epoch: [0] [3850/6500] lr: 0.000030 closs: 0.7856 (0.8159) grad_norm: 0.4618 (0.6590) time: 5.5792 data: 0.0002 max mem: 71357 -[23:22:24.127339] Epoch: [0] [3860/6500] lr: 0.000030 closs: 0.7705 (0.8157) grad_norm: 0.4486 (0.6583) time: 5.5809 data: 0.0001 max mem: 71357 -[23:23:19.774326] Epoch: [0] [3870/6500] lr: 0.000030 closs: 0.7342 (0.8156) grad_norm: 0.4285 (0.6577) time: 5.5729 data: 0.0001 max mem: 71357 -[23:24:15.449400] Epoch: [0] [3880/6500] lr: 0.000030 closs: 0.8044 (0.8156) grad_norm: 0.4773 (0.6583) time: 5.5660 data: 0.0001 max mem: 71357 -[23:25:11.308804] Epoch: [0] [3890/6500] lr: 0.000030 closs: 0.8051 (0.8154) grad_norm: 0.4486 (0.6581) time: 5.5766 data: 0.0001 max mem: 71357 -[23:26:07.026136] Epoch: [0] [3900/6500] lr: 0.000030 closs: 0.8188 (0.8154) grad_norm: 0.4390 (0.6577) time: 5.5787 data: 0.0002 max mem: 71357 -[23:27:02.883831] Epoch: [0] [3910/6500] lr: 0.000030 closs: 0.8057 (0.8154) grad_norm: 0.4913 (0.6572) time: 5.5786 data: 0.0002 max mem: 71357 -[23:27:58.627975] Epoch: [0] [3920/6500] lr: 0.000030 closs: 0.7998 (0.8153) grad_norm: 0.4599 (0.6568) time: 5.5800 data: 0.0001 max mem: 71357 -[23:28:54.466339] Epoch: [0] [3930/6500] lr: 0.000030 closs: 0.8312 (0.8152) grad_norm: 0.4599 (0.6564) time: 5.5790 data: 0.0001 max mem: 71357 -[23:29:50.253141] Epoch: [0] [3940/6500] lr: 0.000030 closs: 0.7910 (0.8150) grad_norm: 0.4448 (0.6559) time: 5.5811 data: 0.0002 max mem: 71357 -[23:30:46.049333] Epoch: [0] [3950/6500] lr: 0.000030 closs: 0.7271 (0.8148) grad_norm: 0.4448 (0.6554) time: 5.5790 data: 0.0002 max mem: 71357 -[23:31:41.791787] Epoch: [0] [3960/6500] lr: 0.000030 closs: 0.7542 (0.8148) grad_norm: 0.4852 (0.6553) time: 5.5768 data: 0.0002 max mem: 71357 -[23:32:37.496971] Epoch: [0] [3970/6500] lr: 0.000031 closs: 0.7941 (0.8147) grad_norm: 0.4852 (0.6549) time: 5.5723 data: 0.0002 max mem: 71357 -[23:33:33.374044] Epoch: [0] [3980/6500] lr: 0.000031 closs: 0.7972 (0.8147) grad_norm: 0.4313 (0.6542) time: 5.5790 data: 0.0002 max mem: 71357 -[23:34:29.051290] Epoch: [0] [3990/6500] lr: 0.000031 closs: 0.7962 (0.8146) grad_norm: 0.4339 (0.6539) time: 5.5776 data: 0.0002 max mem: 71357 -[23:35:24.744469] Epoch: [0] [4000/6500] lr: 0.000031 closs: 0.7493 (0.8144) grad_norm: 0.4339 (0.6536) time: 5.5684 data: 0.0002 max mem: 71357 -[23:36:20.520889] Epoch: [0] [4010/6500] lr: 0.000031 closs: 0.7516 (0.8142) grad_norm: 0.4339 (0.6533) time: 5.5733 data: 0.0002 max mem: 71357 -[23:37:16.369144] Epoch: [0] [4020/6500] lr: 0.000031 closs: 0.8050 (0.8143) grad_norm: 0.4571 (0.6530) time: 5.5811 data: 0.0002 max mem: 71357 -[23:38:12.149618] Epoch: [0] [4030/6500] lr: 0.000031 closs: 0.8104 (0.8143) grad_norm: 0.4843 (0.6527) time: 5.5813 data: 0.0002 max mem: 71357 -[23:39:07.897266] Epoch: [0] [4040/6500] lr: 0.000031 closs: 0.7765 (0.8142) grad_norm: 0.4843 (0.6523) time: 5.5763 data: 0.0002 max mem: 71357 -[23:40:03.635775] Epoch: [0] [4050/6500] lr: 0.000031 closs: 0.7657 (0.8142) grad_norm: 0.4843 (0.6518) time: 5.5742 data: 0.0002 max mem: 71357 -[23:40:59.373975] Epoch: [0] [4060/6500] lr: 0.000031 closs: 0.8069 (0.8142) grad_norm: 0.4719 (0.6513) time: 5.5737 data: 0.0002 max mem: 71357 -[23:41:55.240081] Epoch: [0] [4070/6500] lr: 0.000031 closs: 0.8069 (0.8142) grad_norm: 0.4648 (0.6509) time: 5.5801 data: 0.0002 max mem: 71357 -[23:42:51.032317] Epoch: [0] [4080/6500] lr: 0.000031 closs: 0.8569 (0.8142) grad_norm: 0.4648 (0.6508) time: 5.5828 data: 0.0002 max mem: 71357 -[23:43:46.755137] Epoch: [0] [4090/6500] lr: 0.000031 closs: 0.8239 (0.8142) grad_norm: 0.4661 (0.6506) time: 5.5757 data: 0.0002 max mem: 71357 -[23:44:42.512116] Epoch: [0] [4100/6500] lr: 0.000032 closs: 0.7687 (0.8141) grad_norm: 0.4823 (0.6501) time: 5.5739 data: 0.0002 max mem: 71357 -[23:45:38.454009] Epoch: [0] [4110/6500] lr: 0.000032 closs: 0.7388 (0.8139) grad_norm: 0.4725 (0.6495) time: 5.5848 data: 0.0002 max mem: 71357 -[23:46:34.148019] Epoch: [0] [4120/6500] lr: 0.000032 closs: 0.7714 (0.8141) grad_norm: 0.4684 (0.6490) time: 5.5817 data: 0.0002 max mem: 71357 -[23:47:29.964993] Epoch: [0] [4130/6500] lr: 0.000032 closs: 0.7955 (0.8139) grad_norm: 0.4303 (0.6485) time: 5.5754 data: 0.0001 max mem: 71357 -[23:48:25.655582] Epoch: [0] [4140/6500] lr: 0.000032 closs: 0.6973 (0.8136) grad_norm: 0.4611 (0.6488) time: 5.5753 data: 0.0001 max mem: 71357 -[23:49:21.369932] Epoch: [0] [4150/6500] lr: 0.000032 closs: 0.7174 (0.8135) grad_norm: 0.4709 (0.6485) time: 5.5702 data: 0.0001 max mem: 71357 -[23:50:17.071274] Epoch: [0] [4160/6500] lr: 0.000032 closs: 0.7410 (0.8134) grad_norm: 0.4709 (0.6480) time: 5.5707 data: 0.0001 max mem: 71357 -[23:51:12.793801] Epoch: [0] [4170/6500] lr: 0.000032 closs: 0.6819 (0.8131) grad_norm: 0.4439 (0.6479) time: 5.5711 data: 0.0001 max mem: 71357 -[23:52:08.637169] Epoch: [0] [4180/6500] lr: 0.000032 closs: 0.6791 (0.8130) grad_norm: 0.4330 (0.6475) time: 5.5782 data: 0.0001 max mem: 71357 -[23:53:04.424045] Epoch: [0] [4190/6500] lr: 0.000032 closs: 0.6955 (0.8127) grad_norm: 0.4330 (0.6469) time: 5.5814 data: 0.0002 max mem: 71357 -[23:54:00.361724] Epoch: [0] [4200/6500] lr: 0.000032 closs: 0.7177 (0.8125) grad_norm: 0.3919 (0.6463) time: 5.5861 data: 0.0002 max mem: 71357 -[23:54:56.077357] Epoch: [0] [4210/6500] lr: 0.000032 closs: 0.7655 (0.8125) grad_norm: 0.3950 (0.6458) time: 5.5826 data: 0.0002 max mem: 71357 -[23:55:51.890551] Epoch: [0] [4220/6500] lr: 0.000032 closs: 0.7983 (0.8125) grad_norm: 0.3997 (0.6454) time: 5.5764 data: 0.0002 max mem: 71357 -[23:56:47.608587] Epoch: [0] [4230/6500] lr: 0.000033 closs: 0.8221 (0.8125) grad_norm: 0.4214 (0.6449) time: 5.5765 data: 0.0002 max mem: 71357 -[23:57:43.392132] Epoch: [0] [4240/6500] lr: 0.000033 closs: 0.7227 (0.8123) grad_norm: 0.4214 (0.6444) time: 5.5750 data: 0.0001 max mem: 71357 -[23:58:39.182546] Epoch: [0] [4250/6500] lr: 0.000033 closs: 0.7145 (0.8122) grad_norm: 0.4214 (0.6440) time: 5.5786 data: 0.0001 max mem: 71357 -[23:59:34.962503] Epoch: [0] [4260/6500] lr: 0.000033 closs: 0.7968 (0.8123) grad_norm: 0.4069 (0.6434) time: 5.5784 data: 0.0001 max mem: 71357 -[00:00:30.653249] Epoch: [0] [4270/6500] lr: 0.000033 closs: 0.8109 (0.8125) grad_norm: 0.3782 (0.6429) time: 5.5734 data: 0.0001 max mem: 71357 -[00:01:26.507609] Epoch: [0] [4280/6500] lr: 0.000033 closs: 0.8543 (0.8124) grad_norm: 0.3807 (0.6423) time: 5.5772 data: 0.0001 max mem: 71357 -[00:02:22.266367] Epoch: [0] [4290/6500] lr: 0.000033 closs: 0.7564 (0.8124) grad_norm: 0.3910 (0.6420) time: 5.5806 data: 0.0001 max mem: 71357 -[00:03:18.041885] Epoch: [0] [4300/6500] lr: 0.000033 closs: 0.7627 (0.8123) grad_norm: 0.4018 (0.6417) time: 5.5766 data: 0.0001 max mem: 71357 -[00:04:13.802843] Epoch: [0] [4310/6500] lr: 0.000033 closs: 0.7162 (0.8120) grad_norm: 0.4247 (0.6412) time: 5.5767 data: 0.0001 max mem: 71357 -[00:05:09.467895] Epoch: [0] [4320/6500] lr: 0.000033 closs: 0.6992 (0.8118) grad_norm: 0.4258 (0.6409) time: 5.5712 data: 0.0001 max mem: 71357 -[00:06:05.337634] Epoch: [0] [4330/6500] lr: 0.000033 closs: 0.7296 (0.8117) grad_norm: 0.4258 (0.6405) time: 5.5766 data: 0.0002 max mem: 71357 -[00:07:01.137398] Epoch: [0] [4340/6500] lr: 0.000033 closs: 0.7877 (0.8118) grad_norm: 0.4247 (0.6401) time: 5.5834 data: 0.0002 max mem: 71357 -[00:07:56.821784] Epoch: [0] [4350/6500] lr: 0.000033 closs: 0.8040 (0.8117) grad_norm: 0.4431 (0.6400) time: 5.5741 data: 0.0002 max mem: 71357 -[00:08:52.489372] Epoch: [0] [4360/6500] lr: 0.000034 closs: 0.7670 (0.8116) grad_norm: 0.4431 (0.6398) time: 5.5675 data: 0.0001 max mem: 71357 -[00:09:48.335184] Epoch: [0] [4370/6500] lr: 0.000034 closs: 0.7801 (0.8116) grad_norm: 0.4316 (0.6393) time: 5.5755 data: 0.0002 max mem: 71357 -[00:10:44.089817] Epoch: [0] [4380/6500] lr: 0.000034 closs: 0.7409 (0.8114) grad_norm: 0.4314 (0.6389) time: 5.5799 data: 0.0002 max mem: 71357 -[00:11:39.833736] Epoch: [0] [4390/6500] lr: 0.000034 closs: 0.7122 (0.8114) grad_norm: 0.4360 (0.6385) time: 5.5748 data: 0.0002 max mem: 71357 -[00:12:35.503403] Epoch: [0] [4400/6500] lr: 0.000034 closs: 0.7840 (0.8113) grad_norm: 0.4360 (0.6384) time: 5.5705 data: 0.0001 max mem: 71357 -[00:13:31.186127] Epoch: [0] [4410/6500] lr: 0.000034 closs: 0.7011 (0.8111) grad_norm: 0.4577 (0.6383) time: 5.5675 data: 0.0001 max mem: 71357 -[00:14:26.967773] Epoch: [0] [4420/6500] lr: 0.000034 closs: 0.7771 (0.8112) grad_norm: 0.4687 (0.6380) time: 5.5731 data: 0.0001 max mem: 71357 -[00:15:22.663366] Epoch: [0] [4430/6500] lr: 0.000034 closs: 0.7935 (0.8111) grad_norm: 0.4562 (0.6382) time: 5.5738 data: 0.0001 max mem: 71357 -[00:16:18.415629] Epoch: [0] [4440/6500] lr: 0.000034 closs: 0.7374 (0.8107) grad_norm: 0.4562 (0.6377) time: 5.5723 data: 0.0002 max mem: 71357 -[00:17:14.098438] Epoch: [0] [4450/6500] lr: 0.000034 closs: 0.7220 (0.8107) grad_norm: 0.4404 (0.6374) time: 5.5716 data: 0.0002 max mem: 71357 -[00:18:09.928472] Epoch: [0] [4460/6500] lr: 0.000034 closs: 0.7684 (0.8105) grad_norm: 0.4403 (0.6373) time: 5.5755 data: 0.0001 max mem: 71357 -[00:19:05.687721] Epoch: [0] [4470/6500] lr: 0.000034 closs: 0.7288 (0.8105) grad_norm: 0.4834 (0.6370) time: 5.5793 data: 0.0001 max mem: 71357 -[00:20:01.374623] Epoch: [0] [4480/6500] lr: 0.000034 closs: 0.8508 (0.8105) grad_norm: 0.4678 (0.6365) time: 5.5722 data: 0.0001 max mem: 71357 -[00:20:57.038781] Epoch: [0] [4490/6500] lr: 0.000035 closs: 0.8271 (0.8105) grad_norm: 0.4790 (0.6363) time: 5.5674 data: 0.0001 max mem: 71357 -[00:21:52.724399] Epoch: [0] [4500/6500] lr: 0.000035 closs: 0.8136 (0.8104) grad_norm: 0.4834 (0.6360) time: 5.5674 data: 0.0001 max mem: 71357 -[00:22:48.517686] Epoch: [0] [4510/6500] lr: 0.000035 closs: 0.7913 (0.8104) grad_norm: 0.4602 (0.6357) time: 5.5738 data: 0.0001 max mem: 71357 -[00:23:44.328287] Epoch: [0] [4520/6500] lr: 0.000035 closs: 0.7664 (0.8102) grad_norm: 0.4774 (0.6353) time: 5.5801 data: 0.0001 max mem: 71357 -[00:24:40.071329] Epoch: [0] [4530/6500] lr: 0.000035 closs: 0.7084 (0.8101) grad_norm: 0.4480 (0.6351) time: 5.5776 data: 0.0001 max mem: 71357 -[00:25:35.770539] Epoch: [0] [4540/6500] lr: 0.000035 closs: 0.6961 (0.8099) grad_norm: 0.4464 (0.6349) time: 5.5720 data: 0.0001 max mem: 71357 -[00:26:31.605185] Epoch: [0] [4550/6500] lr: 0.000035 closs: 0.7117 (0.8096) grad_norm: 0.4464 (0.6353) time: 5.5766 data: 0.0001 max mem: 71357 -[00:27:27.408895] Epoch: [0] [4560/6500] lr: 0.000035 closs: 0.7221 (0.8096) grad_norm: 0.4406 (0.6349) time: 5.5818 data: 0.0001 max mem: 71357 -[00:28:23.168811] Epoch: [0] [4570/6500] lr: 0.000035 closs: 0.7814 (0.8096) grad_norm: 0.4406 (0.6345) time: 5.5781 data: 0.0001 max mem: 71357 -[00:29:18.810132] Epoch: [0] [4580/6500] lr: 0.000035 closs: 0.7730 (0.8094) grad_norm: 0.4818 (0.6345) time: 5.5700 data: 0.0001 max mem: 71357 -[00:30:14.742733] Epoch: [0] [4590/6500] lr: 0.000035 closs: 0.7002 (0.8092) grad_norm: 0.4623 (0.6342) time: 5.5786 data: 0.0001 max mem: 71357 -[00:31:10.481301] Epoch: [0] [4600/6500] lr: 0.000035 closs: 0.7432 (0.8091) grad_norm: 0.4623 (0.6339) time: 5.5835 data: 0.0001 max mem: 71357 -[00:32:06.158824] Epoch: [0] [4610/6500] lr: 0.000035 closs: 0.7692 (0.8091) grad_norm: 0.5206 (0.6339) time: 5.5707 data: 0.0001 max mem: 71357 -[00:33:01.810286] Epoch: [0] [4620/6500] lr: 0.000036 closs: 0.7397 (0.8088) grad_norm: 0.5155 (0.6335) time: 5.5663 data: 0.0001 max mem: 71357 -[00:33:57.482433] Epoch: [0] [4630/6500] lr: 0.000036 closs: 0.6831 (0.8086) grad_norm: 0.5155 (0.6331) time: 5.5661 data: 0.0001 max mem: 71357 -[00:34:53.309586] Epoch: [0] [4640/6500] lr: 0.000036 closs: 0.6817 (0.8084) grad_norm: 0.5155 (0.6328) time: 5.5748 data: 0.0001 max mem: 71357 -[00:35:49.041555] Epoch: [0] [4650/6500] lr: 0.000036 closs: 0.7070 (0.8081) grad_norm: 0.4219 (0.6324) time: 5.5779 data: 0.0001 max mem: 71357 -[00:36:44.777926] Epoch: [0] [4660/6500] lr: 0.000036 closs: 0.7158 (0.8080) grad_norm: 0.4482 (0.6320) time: 5.5733 data: 0.0001 max mem: 71357 -[00:37:40.419372] Epoch: [0] [4670/6500] lr: 0.000036 closs: 0.7453 (0.8080) grad_norm: 0.4652 (0.6324) time: 5.5688 data: 0.0001 max mem: 71357 -[00:38:36.219244] Epoch: [0] [4680/6500] lr: 0.000036 closs: 0.7802 (0.8080) grad_norm: 0.4763 (0.6324) time: 5.5720 data: 0.0001 max mem: 71357 -[00:39:31.970352] Epoch: [0] [4690/6500] lr: 0.000036 closs: 0.7953 (0.8080) grad_norm: 0.5270 (0.6321) time: 5.5774 data: 0.0001 max mem: 71357 -[00:40:27.645439] Epoch: [0] [4700/6500] lr: 0.000036 closs: 0.7213 (0.8078) grad_norm: 0.5270 (0.6317) time: 5.5712 data: 0.0001 max mem: 71357 -[00:41:23.345485] Epoch: [0] [4710/6500] lr: 0.000036 closs: 0.7213 (0.8077) grad_norm: 0.4877 (0.6315) time: 5.5686 data: 0.0002 max mem: 71357 -[00:42:19.100016] Epoch: [0] [4720/6500] lr: 0.000036 closs: 0.7877 (0.8077) grad_norm: 0.4571 (0.6311) time: 5.5726 data: 0.0002 max mem: 71357 -[00:43:14.868291] Epoch: [0] [4730/6500] lr: 0.000036 closs: 0.8066 (0.8077) grad_norm: 0.4571 (0.6311) time: 5.5761 data: 0.0001 max mem: 71357 -[00:44:10.617607] Epoch: [0] [4740/6500] lr: 0.000036 closs: 0.8107 (0.8077) grad_norm: 0.4426 (0.6306) time: 5.5758 data: 0.0001 max mem: 71357 -[00:45:06.457161] Epoch: [0] [4750/6500] lr: 0.000037 closs: 0.8107 (0.8076) grad_norm: 0.4339 (0.6306) time: 5.5793 data: 0.0001 max mem: 71357 -[00:46:02.224391] Epoch: [0] [4760/6500] lr: 0.000037 closs: 0.7232 (0.8074) grad_norm: 0.4325 (0.6302) time: 5.5802 data: 0.0002 max mem: 71357 -[00:46:58.073901] Epoch: [0] [4770/6500] lr: 0.000037 closs: 0.6645 (0.8072) grad_norm: 0.3619 (0.6297) time: 5.5807 data: 0.0002 max mem: 71357 -[00:47:53.806339] Epoch: [0] [4780/6500] lr: 0.000037 closs: 0.7087 (0.8071) grad_norm: 0.3475 (0.6292) time: 5.5790 data: 0.0001 max mem: 71357 -[00:48:49.478935] Epoch: [0] [4790/6500] lr: 0.000037 closs: 0.8572 (0.8073) grad_norm: 0.4056 (0.6289) time: 5.5701 data: 0.0001 max mem: 71357 -[00:49:45.256418] Epoch: [0] [4800/6500] lr: 0.000037 closs: 0.7900 (0.8071) grad_norm: 0.4126 (0.6285) time: 5.5724 data: 0.0001 max mem: 71357 -[00:50:41.081781] Epoch: [0] [4810/6500] lr: 0.000037 closs: 0.7765 (0.8071) grad_norm: 0.4338 (0.6283) time: 5.5800 data: 0.0001 max mem: 71357 -[00:51:36.744745] Epoch: [0] [4820/6500] lr: 0.000037 closs: 0.8162 (0.8073) grad_norm: 0.4591 (0.6280) time: 5.5743 data: 0.0001 max mem: 71357 -[00:52:32.438026] Epoch: [0] [4830/6500] lr: 0.000037 closs: 0.8205 (0.8073) grad_norm: 0.4549 (0.6276) time: 5.5677 data: 0.0001 max mem: 71357 -[00:53:28.073336] Epoch: [0] [4840/6500] lr: 0.000037 closs: 0.8205 (0.8074) grad_norm: 0.4776 (0.6276) time: 5.5664 data: 0.0001 max mem: 71357 -[00:54:23.863171] Epoch: [0] [4850/6500] lr: 0.000037 closs: 0.8377 (0.8073) grad_norm: 0.4549 (0.6272) time: 5.5712 data: 0.0001 max mem: 71357 -[00:55:19.616326] Epoch: [0] [4860/6500] lr: 0.000037 closs: 0.7501 (0.8072) grad_norm: 0.4700 (0.6269) time: 5.5771 data: 0.0001 max mem: 71357 -[00:56:15.311381] Epoch: [0] [4870/6500] lr: 0.000037 closs: 0.7358 (0.8071) grad_norm: 0.4839 (0.6266) time: 5.5723 data: 0.0002 max mem: 71357 -[00:57:10.987223] Epoch: [0] [4880/6500] lr: 0.000038 closs: 0.7905 (0.8072) grad_norm: 0.4700 (0.6265) time: 5.5684 data: 0.0002 max mem: 71357 -[00:58:06.697477] Epoch: [0] [4890/6500] lr: 0.000038 closs: 0.8178 (0.8072) grad_norm: 0.4622 (0.6262) time: 5.5692 data: 0.0001 max mem: 71357 -[00:59:02.438896] Epoch: [0] [4900/6500] lr: 0.000038 closs: 0.8119 (0.8072) grad_norm: 0.4416 (0.6260) time: 5.5725 data: 0.0001 max mem: 71357 -[00:59:58.097865] Epoch: [0] [4910/6500] lr: 0.000038 closs: 0.7644 (0.8072) grad_norm: 0.4335 (0.6256) time: 5.5699 data: 0.0001 max mem: 71357 -[01:00:53.757132] Epoch: [0] [4920/6500] lr: 0.000038 closs: 0.7307 (0.8071) grad_norm: 0.4416 (0.6254) time: 5.5658 data: 0.0002 max mem: 71357 -[01:01:49.419719] Epoch: [0] [4930/6500] lr: 0.000038 closs: 0.7684 (0.8071) grad_norm: 0.4416 (0.6251) time: 5.5660 data: 0.0002 max mem: 71357 -[01:02:45.018759] Epoch: [0] [4940/6500] lr: 0.000038 closs: 0.8007 (0.8071) grad_norm: 0.4659 (0.6250) time: 5.5630 data: 0.0001 max mem: 71357 -[01:03:40.831339] Epoch: [0] [4950/6500] lr: 0.000038 closs: 0.7903 (0.8070) grad_norm: 0.4948 (0.6247) time: 5.5705 data: 0.0001 max mem: 71357 -[01:04:36.520963] Epoch: [0] [4960/6500] lr: 0.000038 closs: 0.7722 (0.8068) grad_norm: 0.4659 (0.6243) time: 5.5750 data: 0.0001 max mem: 71357 -[01:05:32.153893] Epoch: [0] [4970/6500] lr: 0.000038 closs: 0.7275 (0.8067) grad_norm: 0.4508 (0.6239) time: 5.5660 data: 0.0001 max mem: 71357 -[01:06:27.840249] Epoch: [0] [4980/6500] lr: 0.000038 closs: 0.7662 (0.8067) grad_norm: 0.4232 (0.6235) time: 5.5658 data: 0.0002 max mem: 71357 -[01:07:23.696178] Epoch: [0] [4990/6500] lr: 0.000038 closs: 0.7900 (0.8066) grad_norm: 0.4029 (0.6231) time: 5.5770 data: 0.0002 max mem: 71357 -[01:08:19.291258] Epoch: [0] [5000/6500] lr: 0.000038 closs: 0.6995 (0.8064) grad_norm: 0.4029 (0.6229) time: 5.5725 data: 0.0001 max mem: 71357 -[01:09:14.973960] Epoch: [0] [5010/6500] lr: 0.000039 closs: 0.6936 (0.8064) grad_norm: 0.4029 (0.6225) time: 5.5638 data: 0.0001 max mem: 71357 -[01:10:10.617873] Epoch: [0] [5020/6500] lr: 0.000039 closs: 0.7353 (0.8063) grad_norm: 0.4315 (0.6223) time: 5.5663 data: 0.0001 max mem: 71357 -[01:11:06.449934] Epoch: [0] [5030/6500] lr: 0.000039 closs: 0.8164 (0.8063) grad_norm: 0.4530 (0.6219) time: 5.5737 data: 0.0001 max mem: 71357 -[01:12:02.146675] Epoch: [0] [5040/6500] lr: 0.000039 closs: 0.7967 (0.8062) grad_norm: 0.4530 (0.6215) time: 5.5763 data: 0.0001 max mem: 71357 -[01:12:57.852880] Epoch: [0] [5050/6500] lr: 0.000039 closs: 0.7883 (0.8062) grad_norm: 0.4232 (0.6211) time: 5.5700 data: 0.0001 max mem: 71357 -[01:13:53.575353] Epoch: [0] [5060/6500] lr: 0.000039 closs: 0.7221 (0.8059) grad_norm: 0.4012 (0.6207) time: 5.5713 data: 0.0001 max mem: 71357 -[01:14:49.232927] Epoch: [0] [5070/6500] lr: 0.000039 closs: 0.6839 (0.8059) grad_norm: 0.3975 (0.6202) time: 5.5689 data: 0.0001 max mem: 71357 -[01:15:45.028965] Epoch: [0] [5080/6500] lr: 0.000039 closs: 0.7101 (0.8057) grad_norm: 0.4012 (0.6198) time: 5.5726 data: 0.0001 max mem: 71357 -[01:16:40.801994] Epoch: [0] [5090/6500] lr: 0.000039 closs: 0.7101 (0.8056) grad_norm: 0.3859 (0.6193) time: 5.5783 data: 0.0001 max mem: 71357 -[01:17:36.606893] Epoch: [0] [5100/6500] lr: 0.000039 closs: 0.7536 (0.8056) grad_norm: 0.3690 (0.6189) time: 5.5788 data: 0.0001 max mem: 71357 -[01:18:32.252167] Epoch: [0] [5110/6500] lr: 0.000039 closs: 0.7552 (0.8055) grad_norm: 0.3921 (0.6186) time: 5.5724 data: 0.0001 max mem: 71357 -[01:19:28.118493] Epoch: [0] [5120/6500] lr: 0.000039 closs: 0.7621 (0.8055) grad_norm: 0.3921 (0.6181) time: 5.5755 data: 0.0001 max mem: 71357 -[01:20:23.806918] Epoch: [0] [5130/6500] lr: 0.000039 closs: 0.7900 (0.8054) grad_norm: 0.3971 (0.6178) time: 5.5776 data: 0.0001 max mem: 71357 -[01:21:19.560680] Epoch: [0] [5140/6500] lr: 0.000040 closs: 0.7998 (0.8055) grad_norm: 0.4023 (0.6177) time: 5.5720 data: 0.0002 max mem: 71357 -[01:22:15.265374] Epoch: [0] [5150/6500] lr: 0.000040 closs: 0.8014 (0.8056) grad_norm: 0.4023 (0.6174) time: 5.5728 data: 0.0002 max mem: 71357 -[01:23:10.920939] Epoch: [0] [5160/6500] lr: 0.000040 closs: 0.8151 (0.8056) grad_norm: 0.4297 (0.6172) time: 5.5679 data: 0.0001 max mem: 71357 -[01:24:06.835511] Epoch: [0] [5170/6500] lr: 0.000040 closs: 0.7360 (0.8054) grad_norm: 0.4297 (0.6169) time: 5.5784 data: 0.0001 max mem: 71357 -[01:25:02.500269] Epoch: [0] [5180/6500] lr: 0.000040 closs: 0.7251 (0.8053) grad_norm: 0.4110 (0.6165) time: 5.5789 data: 0.0001 max mem: 71357 -[01:25:58.200527] Epoch: [0] [5190/6500] lr: 0.000040 closs: 0.7061 (0.8051) grad_norm: 0.4110 (0.6162) time: 5.5681 data: 0.0001 max mem: 71357 -[01:26:53.995989] Epoch: [0] [5200/6500] lr: 0.000040 closs: 0.7137 (0.8051) grad_norm: 0.3776 (0.6158) time: 5.5747 data: 0.0001 max mem: 71357 -[01:27:49.727844] Epoch: [0] [5210/6500] lr: 0.000040 closs: 0.7441 (0.8049) grad_norm: 0.3776 (0.6156) time: 5.5762 data: 0.0001 max mem: 71357 -[01:28:45.507070] Epoch: [0] [5220/6500] lr: 0.000040 closs: 0.7577 (0.8049) grad_norm: 0.3804 (0.6153) time: 5.5754 data: 0.0001 max mem: 71357 -[01:29:41.159681] Epoch: [0] [5230/6500] lr: 0.000040 closs: 0.7887 (0.8048) grad_norm: 0.3804 (0.6169) time: 5.5715 data: 0.0001 max mem: 71357 -[01:30:36.804523] Epoch: [0] [5240/6500] lr: 0.000040 closs: 0.7531 (0.8048) grad_norm: 0.4002 (0.6164) time: 5.5648 data: 0.0001 max mem: 71357 -[01:31:32.606137] Epoch: [0] [5250/6500] lr: 0.000040 closs: 0.7744 (0.8048) grad_norm: 0.4071 (0.6161) time: 5.5722 data: 0.0001 max mem: 71357 -[01:32:28.301729] Epoch: [0] [5260/6500] lr: 0.000040 closs: 0.8277 (0.8048) grad_norm: 0.4002 (0.6158) time: 5.5747 data: 0.0001 max mem: 71357 -[01:33:24.112652] Epoch: [0] [5270/6500] lr: 0.000041 closs: 0.7664 (0.8048) grad_norm: 0.3901 (0.6154) time: 5.5752 data: 0.0001 max mem: 71357 -[01:34:19.868194] Epoch: [0] [5280/6500] lr: 0.000041 closs: 0.7340 (0.8046) grad_norm: 0.3901 (0.6151) time: 5.5782 data: 0.0001 max mem: 71357 -[01:35:15.606432] Epoch: [0] [5290/6500] lr: 0.000041 closs: 0.6905 (0.8045) grad_norm: 0.3896 (0.6146) time: 5.5746 data: 0.0001 max mem: 71357 -[01:36:11.383142] Epoch: [0] [5300/6500] lr: 0.000041 closs: 0.7767 (0.8045) grad_norm: 0.3896 (0.6143) time: 5.5757 data: 0.0001 max mem: 71357 -[01:37:07.009457] Epoch: [0] [5310/6500] lr: 0.000041 closs: 0.7287 (0.8044) grad_norm: 0.3921 (0.6145) time: 5.5701 data: 0.0001 max mem: 71357 -[01:38:02.669739] Epoch: [0] [5320/6500] lr: 0.000041 closs: 0.7413 (0.8044) grad_norm: 0.4500 (0.6148) time: 5.5642 data: 0.0001 max mem: 71357 -[01:38:58.422769] Epoch: [0] [5330/6500] lr: 0.000041 closs: 0.7950 (0.8044) grad_norm: 0.4939 (0.6144) time: 5.5706 data: 0.0001 max mem: 71357 -[01:39:54.248251] Epoch: [0] [5340/6500] lr: 0.000041 closs: 0.7414 (0.8043) grad_norm: 0.5255 (0.6143) time: 5.5788 data: 0.0001 max mem: 71357 -[01:40:50.000851] Epoch: [0] [5350/6500] lr: 0.000041 closs: 0.7228 (0.8042) grad_norm: 0.4853 (0.6139) time: 5.5788 data: 0.0001 max mem: 71357 -[01:41:45.707984] Epoch: [0] [5360/6500] lr: 0.000041 closs: 0.7594 (0.8041) grad_norm: 0.4699 (0.6136) time: 5.5729 data: 0.0001 max mem: 71357 -[01:42:41.349859] Epoch: [0] [5370/6500] lr: 0.000041 closs: 0.7594 (0.8040) grad_norm: 0.4699 (0.6133) time: 5.5673 data: 0.0001 max mem: 71357 -[01:43:37.059969] Epoch: [0] [5380/6500] lr: 0.000041 closs: 0.7105 (0.8039) grad_norm: 0.4386 (0.6130) time: 5.5675 data: 0.0001 max mem: 71357 -[01:44:32.852298] Epoch: [0] [5390/6500] lr: 0.000041 closs: 0.7781 (0.8039) grad_norm: 0.4505 (0.6128) time: 5.5750 data: 0.0001 max mem: 71357 -[01:45:28.525634] Epoch: [0] [5400/6500] lr: 0.000042 closs: 0.7665 (0.8038) grad_norm: 0.4386 (0.6126) time: 5.5732 data: 0.0001 max mem: 71357 -[01:46:24.224107] Epoch: [0] [5410/6500] lr: 0.000042 closs: 0.7346 (0.8037) grad_norm: 0.4365 (0.6124) time: 5.5685 data: 0.0002 max mem: 71357 -[01:47:19.853525] Epoch: [0] [5420/6500] lr: 0.000042 closs: 0.7093 (0.8034) grad_norm: 0.5518 (0.6123) time: 5.5663 data: 0.0002 max mem: 71357 -[01:48:15.651981] Epoch: [0] [5430/6500] lr: 0.000042 closs: 0.6989 (0.8033) grad_norm: 0.4783 (0.6121) time: 5.5713 data: 0.0001 max mem: 71357 -[01:49:11.451075] Epoch: [0] [5440/6500] lr: 0.000042 closs: 0.7255 (0.8033) grad_norm: 0.4431 (0.6119) time: 5.5797 data: 0.0001 max mem: 71357 -[01:50:07.107339] Epoch: [0] [5450/6500] lr: 0.000042 closs: 0.7646 (0.8033) grad_norm: 0.4431 (0.6116) time: 5.5726 data: 0.0001 max mem: 71357 -[01:51:02.882550] Epoch: [0] [5460/6500] lr: 0.000042 closs: 0.7432 (0.8032) grad_norm: 0.4143 (0.6113) time: 5.5715 data: 0.0002 max mem: 71357 -[01:51:58.596127] Epoch: [0] [5470/6500] lr: 0.000042 closs: 0.7326 (0.8032) grad_norm: 0.4256 (0.6112) time: 5.5744 data: 0.0002 max mem: 71357 -[01:52:54.291718] Epoch: [0] [5480/6500] lr: 0.000042 closs: 0.7713 (0.8033) grad_norm: 0.4343 (0.6207) time: 5.5704 data: 0.0001 max mem: 71357 -[01:53:50.039357] Epoch: [0] [5490/6500] lr: 0.000042 closs: 0.7713 (0.8033) grad_norm: 0.4256 (0.6203) time: 5.5720 data: 0.0001 max mem: 71357 -[01:54:45.630943] Epoch: [0] [5500/6500] lr: 0.000042 closs: 0.7862 (0.8032) grad_norm: 0.4718 (0.6201) time: 5.5668 data: 0.0001 max mem: 71357 -[01:55:41.348185] Epoch: [0] [5510/6500] lr: 0.000042 closs: 0.7569 (0.8031) grad_norm: 0.4718 (0.6200) time: 5.5653 data: 0.0001 max mem: 71357 -[01:56:37.065250] Epoch: [0] [5520/6500] lr: 0.000042 closs: 0.7050 (0.8030) grad_norm: 0.4453 (0.6197) time: 5.5716 data: 0.0002 max mem: 71357 -[01:57:32.797146] Epoch: [0] [5530/6500] lr: 0.000043 closs: 0.7246 (0.8030) grad_norm: 0.4408 (0.6194) time: 5.5723 data: 0.0002 max mem: 71357 -[01:58:28.499852] Epoch: [0] [5540/6500] lr: 0.000043 closs: 0.7876 (0.8030) grad_norm: 0.4408 (0.6191) time: 5.5716 data: 0.0001 max mem: 71357 -[01:59:24.128893] Epoch: [0] [5550/6500] lr: 0.000043 closs: 0.7692 (0.8030) grad_norm: 0.4469 (0.6190) time: 5.5665 data: 0.0001 max mem: 71357 -[02:00:19.963717] Epoch: [0] [5560/6500] lr: 0.000043 closs: 0.6882 (0.8026) grad_norm: 0.4469 (0.6187) time: 5.5731 data: 0.0001 max mem: 71357 -[02:01:15.615241] Epoch: [0] [5570/6500] lr: 0.000043 closs: 0.7543 (0.8027) grad_norm: 0.4919 (0.6188) time: 5.5742 data: 0.0002 max mem: 71357 -[02:02:11.280741] Epoch: [0] [5580/6500] lr: 0.000043 closs: 0.7857 (0.8028) grad_norm: 0.5595 (0.6187) time: 5.5657 data: 0.0002 max mem: 71357 -[02:03:06.899155] Epoch: [0] [5590/6500] lr: 0.000043 closs: 0.7333 (0.8028) grad_norm: 0.5595 (0.6185) time: 5.5641 data: 0.0001 max mem: 71357 -[02:04:02.557866] Epoch: [0] [5600/6500] lr: 0.000043 closs: 0.7701 (0.8029) grad_norm: 0.4936 (0.6182) time: 5.5637 data: 0.0001 max mem: 71357 -[02:04:58.346233] Epoch: [0] [5610/6500] lr: 0.000043 closs: 0.7756 (0.8028) grad_norm: 0.4655 (0.6179) time: 5.5722 data: 0.0001 max mem: 71357 -[02:05:54.107394] Epoch: [0] [5620/6500] lr: 0.000043 closs: 0.7756 (0.8027) grad_norm: 0.4211 (0.6174) time: 5.5774 data: 0.0001 max mem: 71357 -[02:06:49.764956] Epoch: [0] [5630/6500] lr: 0.000043 closs: 0.7647 (0.8026) grad_norm: 0.4478 (0.6173) time: 5.5708 data: 0.0001 max mem: 71357 -[02:07:45.456020] Epoch: [0] [5640/6500] lr: 0.000043 closs: 0.7647 (0.8025) grad_norm: 0.4073 (0.6169) time: 5.5673 data: 0.0001 max mem: 71357 -[02:08:41.296090] Epoch: [0] [5650/6500] lr: 0.000043 closs: 0.7760 (0.8025) grad_norm: 0.3969 (0.6166) time: 5.5764 data: 0.0001 max mem: 71357 -[02:09:37.090246] Epoch: [0] [5660/6500] lr: 0.000044 closs: 0.7567 (0.8025) grad_norm: 0.4379 (0.6163) time: 5.5816 data: 0.0001 max mem: 71357 -[02:10:32.741323] Epoch: [0] [5670/6500] lr: 0.000044 closs: 0.7353 (0.8024) grad_norm: 0.4004 (0.6161) time: 5.5721 data: 0.0001 max mem: 71357 -[02:11:28.424749] Epoch: [0] [5680/6500] lr: 0.000044 closs: 0.8031 (0.8024) grad_norm: 0.4379 (0.6159) time: 5.5666 data: 0.0002 max mem: 71357 -[02:12:24.285486] Epoch: [0] [5690/6500] lr: 0.000044 closs: 0.7536 (0.8023) grad_norm: 0.4379 (0.6155) time: 5.5771 data: 0.0002 max mem: 71357 -[02:13:19.918884] Epoch: [0] [5700/6500] lr: 0.000044 closs: 0.6936 (0.8021) grad_norm: 0.4161 (0.6153) time: 5.5746 data: 0.0001 max mem: 71357 -[02:14:15.599778] Epoch: [0] [5710/6500] lr: 0.000044 closs: 0.6915 (0.8019) grad_norm: 0.4161 (0.6149) time: 5.5656 data: 0.0001 max mem: 71357 -[02:15:11.331649] Epoch: [0] [5720/6500] lr: 0.000044 closs: 0.7293 (0.8019) grad_norm: 0.4047 (0.6147) time: 5.5705 data: 0.0001 max mem: 71357 -[02:16:07.152160] Epoch: [0] [5730/6500] lr: 0.000044 closs: 0.8008 (0.8020) grad_norm: 0.3836 (0.6144) time: 5.5775 data: 0.0002 max mem: 71357 -[02:17:03.066211] Epoch: [0] [5740/6500] lr: 0.000044 closs: 0.8080 (0.8020) grad_norm: 0.3720 (0.6140) time: 5.5866 data: 0.0002 max mem: 71357 -[02:17:58.785745] Epoch: [0] [5750/6500] lr: 0.000044 closs: 0.7575 (0.8018) grad_norm: 0.3820 (0.6138) time: 5.5816 data: 0.0001 max mem: 71357 -[02:18:54.476351] Epoch: [0] [5760/6500] lr: 0.000044 closs: 0.7069 (0.8017) grad_norm: 0.3731 (0.6134) time: 5.5704 data: 0.0001 max mem: 71357 -[02:19:50.250044] Epoch: [0] [5770/6500] lr: 0.000044 closs: 0.7394 (0.8016) grad_norm: 0.3789 (0.6132) time: 5.5731 data: 0.0001 max mem: 71357 -[02:20:46.124223] Epoch: [0] [5780/6500] lr: 0.000044 closs: 0.7709 (0.8015) grad_norm: 0.3789 (0.6128) time: 5.5823 data: 0.0001 max mem: 71357 -[02:21:41.877082] Epoch: [0] [5790/6500] lr: 0.000045 closs: 0.7787 (0.8014) grad_norm: 0.3731 (0.6127) time: 5.5813 data: 0.0002 max mem: 71357 -[02:22:37.579383] Epoch: [0] [5800/6500] lr: 0.000045 closs: 0.7921 (0.8014) grad_norm: 0.3789 (0.6125) time: 5.5727 data: 0.0002 max mem: 71357 -[02:23:33.334032] Epoch: [0] [5810/6500] lr: 0.000045 closs: 0.7929 (0.8014) grad_norm: 0.3504 (0.6121) time: 5.5727 data: 0.0001 max mem: 71357 -[02:24:29.023544] Epoch: [0] [5820/6500] lr: 0.000045 closs: 0.7977 (0.8014) grad_norm: 0.3907 (0.6119) time: 5.5721 data: 0.0001 max mem: 71357 -[02:25:24.861645] Epoch: [0] [5830/6500] lr: 0.000045 closs: 0.7955 (0.8014) grad_norm: 0.4042 (0.6119) time: 5.5763 data: 0.0001 max mem: 71357 -[02:26:20.652863] Epoch: [0] [5840/6500] lr: 0.000045 closs: 0.7487 (0.8014) grad_norm: 0.3819 (0.6115) time: 5.5813 data: 0.0001 max mem: 71357 -[02:27:16.301506] Epoch: [0] [5850/6500] lr: 0.000045 closs: 0.7196 (0.8012) grad_norm: 0.3847 (0.6113) time: 5.5719 data: 0.0001 max mem: 71357 -[02:28:12.058501] Epoch: [0] [5860/6500] lr: 0.000045 closs: 0.7021 (0.8011) grad_norm: 0.3812 (0.6109) time: 5.5702 data: 0.0001 max mem: 71357 -[02:29:07.841507] Epoch: [0] [5870/6500] lr: 0.000045 closs: 0.7515 (0.8010) grad_norm: 0.3859 (0.6106) time: 5.5769 data: 0.0001 max mem: 71357 -[02:30:03.483436] Epoch: [0] [5880/6500] lr: 0.000045 closs: 0.7809 (0.8010) grad_norm: 0.4351 (0.6106) time: 5.5711 data: 0.0001 max mem: 71357 -[02:30:59.287191] Epoch: [0] [5890/6500] lr: 0.000045 closs: 0.7648 (0.8009) grad_norm: 0.4266 (0.6104) time: 5.5722 data: 0.0001 max mem: 71357 -[02:31:55.068642] Epoch: [0] [5900/6500] lr: 0.000045 closs: 0.7531 (0.8008) grad_norm: 0.4351 (0.6100) time: 5.5791 data: 0.0001 max mem: 71357 -[02:32:50.950525] Epoch: [0] [5910/6500] lr: 0.000045 closs: 0.7531 (0.8007) grad_norm: 0.4113 (0.6096) time: 5.5831 data: 0.0001 max mem: 71357 -[02:33:46.661662] Epoch: [0] [5920/6500] lr: 0.000046 closs: 0.7102 (0.8005) grad_norm: 0.3950 (0.6096) time: 5.5796 data: 0.0001 max mem: 71357 -[02:34:42.406758] Epoch: [0] [5930/6500] lr: 0.000046 closs: 0.7361 (0.8005) grad_norm: 0.3965 (0.6096) time: 5.5727 data: 0.0001 max mem: 71357 -[02:35:38.231017] Epoch: [0] [5940/6500] lr: 0.000046 closs: 0.8309 (0.8006) grad_norm: 0.3901 (0.6092) time: 5.5783 data: 0.0001 max mem: 71357 -[02:36:34.027442] Epoch: [0] [5950/6500] lr: 0.000046 closs: 0.8309 (0.8005) grad_norm: 0.3965 (0.6088) time: 5.5809 data: 0.0002 max mem: 71357 -[02:37:30.000045] Epoch: [0] [5960/6500] lr: 0.000046 closs: 0.7825 (0.8006) grad_norm: 0.3745 (0.6084) time: 5.5883 data: 0.0002 max mem: 71357 -[02:38:25.764491] Epoch: [0] [5970/6500] lr: 0.000046 closs: 0.8335 (0.8005) grad_norm: 0.3616 (0.6081) time: 5.5868 data: 0.0001 max mem: 71357 -[02:39:21.355752] Epoch: [0] [5980/6500] lr: 0.000046 closs: 0.8265 (0.8005) grad_norm: 0.3835 (0.6082) time: 5.5677 data: 0.0001 max mem: 71357 -[02:40:17.146222] Epoch: [0] [5990/6500] lr: 0.000046 closs: 0.7573 (0.8004) grad_norm: 0.3745 (0.6079) time: 5.5690 data: 0.0001 max mem: 71357 -[02:41:12.953638] Epoch: [0] [6000/6500] lr: 0.000046 closs: 0.7502 (0.8004) grad_norm: 0.3969 (0.6075) time: 5.5798 data: 0.0002 max mem: 71357 -[02:42:08.673781] Epoch: [0] [6010/6500] lr: 0.000046 closs: 0.7498 (0.8004) grad_norm: 0.3891 (0.6072) time: 5.5762 data: 0.0002 max mem: 71357 -[02:43:04.453165] Epoch: [0] [6020/6500] lr: 0.000046 closs: 0.7474 (0.8002) grad_norm: 0.3891 (0.6068) time: 5.5749 data: 0.0001 max mem: 71357 -[02:44:00.201410] Epoch: [0] [6030/6500] lr: 0.000046 closs: 0.7241 (0.8002) grad_norm: 0.3969 (0.6066) time: 5.5763 data: 0.0001 max mem: 71357 -[02:44:55.881083] Epoch: [0] [6040/6500] lr: 0.000046 closs: 0.7288 (0.8002) grad_norm: 0.4122 (0.6063) time: 5.5713 data: 0.0001 max mem: 71357 -[02:45:51.715473] Epoch: [0] [6050/6500] lr: 0.000047 closs: 0.7671 (0.8000) grad_norm: 0.4315 (0.6062) time: 5.5756 data: 0.0001 max mem: 71357 -[02:46:47.325180] Epoch: [0] [6060/6500] lr: 0.000047 closs: 0.7667 (0.7999) grad_norm: 0.4321 (0.6060) time: 5.5721 data: 0.0002 max mem: 71357 -[02:47:42.956637] Epoch: [0] [6070/6500] lr: 0.000047 closs: 0.7482 (0.7999) grad_norm: 0.4387 (0.6060) time: 5.5620 data: 0.0002 max mem: 71357 -[02:48:38.729897] Epoch: [0] [6080/6500] lr: 0.000047 closs: 0.7457 (0.7999) grad_norm: 0.4835 (0.6058) time: 5.5701 data: 0.0001 max mem: 71357 -[02:49:34.468997] Epoch: [0] [6090/6500] lr: 0.000047 closs: 0.7455 (0.7998) grad_norm: 0.5015 (0.6058) time: 5.5755 data: 0.0001 max mem: 71357 -[02:50:30.184089] Epoch: [0] [6100/6500] lr: 0.000047 closs: 0.7606 (0.7998) grad_norm: 0.5218 (0.6058) time: 5.5726 data: 0.0001 max mem: 71357 -[02:51:25.929815] Epoch: [0] [6110/6500] lr: 0.000047 closs: 0.8044 (0.7998) grad_norm: 0.4483 (0.6054) time: 5.5729 data: 0.0002 max mem: 71357 -[02:52:21.562615] Epoch: [0] [6120/6500] lr: 0.000047 closs: 0.7751 (0.7998) grad_norm: 0.4406 (0.6052) time: 5.5688 data: 0.0002 max mem: 71357 -[02:53:17.515875] Epoch: [0] [6130/6500] lr: 0.000047 closs: 0.8261 (0.7999) grad_norm: 0.3885 (0.6049) time: 5.5792 data: 0.0001 max mem: 71357 -[02:54:13.223511] Epoch: [0] [6140/6500] lr: 0.000047 closs: 0.8261 (0.7999) grad_norm: 0.3792 (0.6049) time: 5.5830 data: 0.0001 max mem: 71357 -[02:55:08.927662] Epoch: [0] [6150/6500] lr: 0.000047 closs: 0.7420 (0.7997) grad_norm: 0.3861 (0.6051) time: 5.5705 data: 0.0001 max mem: 71357 -[02:56:04.619859] Epoch: [0] [6160/6500] lr: 0.000047 closs: 0.7420 (0.7997) grad_norm: 0.3739 (0.6047) time: 5.5697 data: 0.0001 max mem: 71357 -[02:57:00.344520] Epoch: [0] [6170/6500] lr: 0.000047 closs: 0.7830 (0.7997) grad_norm: 0.3740 (0.6044) time: 5.5707 data: 0.0001 max mem: 71357 -[02:57:56.275405] Epoch: [0] [6180/6500] lr: 0.000048 closs: 0.8380 (0.7997) grad_norm: 0.3740 (0.6042) time: 5.5827 data: 0.0001 max mem: 71357 -[02:58:52.075593] Epoch: [0] [6190/6500] lr: 0.000048 closs: 0.7414 (0.7996) grad_norm: 0.3430 (0.6037) time: 5.5865 data: 0.0001 max mem: 71357 -[02:59:47.687443] Epoch: [0] [6200/6500] lr: 0.000048 closs: 0.7333 (0.7995) grad_norm: 0.3656 (0.6035) time: 5.5705 data: 0.0001 max mem: 71357 -[03:00:43.414828] Epoch: [0] [6210/6500] lr: 0.000048 closs: 0.7467 (0.7997) grad_norm: 0.3530 (0.6032) time: 5.5669 data: 0.0001 max mem: 71357 -[03:01:39.227260] Epoch: [0] [6220/6500] lr: 0.000048 closs: 0.7170 (0.7994) grad_norm: 0.3656 (0.6029) time: 5.5769 data: 0.0001 max mem: 71357 -[03:02:35.020515] Epoch: [0] [6230/6500] lr: 0.000048 closs: 0.6600 (0.7994) grad_norm: 0.4024 (0.6027) time: 5.5802 data: 0.0001 max mem: 71357 -[03:03:30.712438] Epoch: [0] [6240/6500] lr: 0.000048 closs: 0.6600 (0.7993) grad_norm: 0.4024 (0.6024) time: 5.5742 data: 0.0001 max mem: 71357 -[03:04:26.392385] Epoch: [0] [6250/6500] lr: 0.000048 closs: 0.7017 (0.7992) grad_norm: 0.4194 (0.6022) time: 5.5685 data: 0.0001 max mem: 71357 -[03:05:22.220010] Epoch: [0] [6260/6500] lr: 0.000048 closs: 0.7189 (0.7991) grad_norm: 0.4248 (0.6019) time: 5.5753 data: 0.0001 max mem: 71357 -[03:06:18.144391] Epoch: [0] [6270/6500] lr: 0.000048 closs: 0.7910 (0.7992) grad_norm: 0.4248 (0.6016) time: 5.5875 data: 0.0002 max mem: 71357 -[03:07:13.838284] Epoch: [0] [6280/6500] lr: 0.000048 closs: 0.7742 (0.7991) grad_norm: 0.4084 (0.6013) time: 5.5808 data: 0.0002 max mem: 71357 -[03:08:09.622986] Epoch: [0] [6290/6500] lr: 0.000048 closs: 0.7474 (0.7991) grad_norm: 0.3958 (0.6010) time: 5.5738 data: 0.0001 max mem: 71357 -[03:09:05.363224] Epoch: [0] [6300/6500] lr: 0.000048 closs: 0.7197 (0.7990) grad_norm: 0.3744 (0.6008) time: 5.5761 data: 0.0001 max mem: 71357 -[03:10:01.116113] Epoch: [0] [6310/6500] lr: 0.000049 closs: 0.8256 (0.7990) grad_norm: 0.3958 (0.6005) time: 5.5745 data: 0.0001 max mem: 71357 -[03:10:56.828047] Epoch: [0] [6320/6500] lr: 0.000049 closs: 0.8374 (0.7991) grad_norm: 0.3957 (0.6003) time: 5.5731 data: 0.0001 max mem: 71357 -[03:11:52.510798] Epoch: [0] [6330/6500] lr: 0.000049 closs: 0.7012 (0.7988) grad_norm: 0.4162 (0.6001) time: 5.5696 data: 0.0002 max mem: 71357 -[03:12:48.232518] Epoch: [0] [6340/6500] lr: 0.000049 closs: 0.6872 (0.7987) grad_norm: 0.4371 (0.5999) time: 5.5701 data: 0.0002 max mem: 71357 -[03:13:43.960654] Epoch: [0] [6350/6500] lr: 0.000049 closs: 0.7740 (0.7987) grad_norm: 0.4367 (0.5996) time: 5.5724 data: 0.0001 max mem: 71357 -[03:14:39.727757] Epoch: [0] [6360/6500] lr: 0.000049 closs: 0.7274 (0.7985) grad_norm: 0.4371 (0.5994) time: 5.5747 data: 0.0001 max mem: 71357 -[03:15:35.404672] Epoch: [0] [6370/6500] lr: 0.000049 closs: 0.7655 (0.7986) grad_norm: 0.4357 (0.5992) time: 5.5721 data: 0.0001 max mem: 71357 -[03:16:31.087842] Epoch: [0] [6380/6500] lr: 0.000049 closs: 0.7822 (0.7986) grad_norm: 0.4313 (0.5989) time: 5.5679 data: 0.0002 max mem: 71357 -[03:17:26.822736] Epoch: [0] [6390/6500] lr: 0.000049 closs: 0.7659 (0.7985) grad_norm: 0.4313 (0.5989) time: 5.5708 data: 0.0002 max mem: 71357 -[03:18:22.577493] Epoch: [0] [6400/6500] lr: 0.000049 closs: 0.6885 (0.7984) grad_norm: 0.4265 (0.5989) time: 5.5744 data: 0.0001 max mem: 71357 -[03:19:18.330428] Epoch: [0] [6410/6500] lr: 0.000049 closs: 0.7096 (0.7982) grad_norm: 0.4265 (0.5987) time: 5.5753 data: 0.0001 max mem: 71357 -[03:20:14.076963] Epoch: [0] [6420/6500] lr: 0.000049 closs: 0.7532 (0.7982) grad_norm: 0.4265 (0.5984) time: 5.5749 data: 0.0001 max mem: 71357 -[03:21:09.837780] Epoch: [0] [6430/6500] lr: 0.000049 closs: 0.7365 (0.7980) grad_norm: 0.4245 (0.5982) time: 5.5753 data: 0.0001 max mem: 71357 -[03:22:05.632480] Epoch: [0] [6440/6500] lr: 0.000050 closs: 0.7185 (0.7980) grad_norm: 0.4245 (0.5980) time: 5.5777 data: 0.0001 max mem: 71357 -[03:23:01.288584] Epoch: [0] [6450/6500] lr: 0.000050 closs: 0.7666 (0.7979) grad_norm: 0.4007 (0.5978) time: 5.5724 data: 0.0001 max mem: 71357 -[03:23:56.940427] Epoch: [0] [6460/6500] lr: 0.000050 closs: 0.7144 (0.7978) grad_norm: 0.3927 (0.5975) time: 5.5653 data: 0.0001 max mem: 71357 -[03:24:52.591248] Epoch: [0] [6470/6500] lr: 0.000050 closs: 0.7340 (0.7977) grad_norm: 0.4434 (0.5979) time: 5.5650 data: 0.0001 max mem: 71357 -[03:25:48.307940] Epoch: [0] [6480/6500] lr: 0.000050 closs: 0.7340 (0.7977) grad_norm: 0.4148 (0.5983) time: 5.5683 data: 0.0001 max mem: 71357 -[03:26:44.126298] Epoch: [0] [6490/6500] lr: 0.000050 closs: 0.7379 (0.7977) grad_norm: 0.4968 (0.5982) time: 5.5767 data: 0.0002 max mem: 71357 -[03:27:34.661033] Epoch: [0] Total time: 10:04:03 -[03:27:34.691953] Averaged stats: lr: 0.000050 closs: 0.7927 (0.7980) grad_norm: 0.4559 (0.5980) -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[03:27:34.851776] model saved -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[03:27:35.757911] optimizer saved -[03:27:35.758397] other rank-common saved -[03:27:35.761691] rank-specific saved -[03:27:35.770694] log_dir: ./output_dir -[03:27:43.916967] Epoch: [1] [0/6500] lr: 0.000050 closs: 0.6486 (0.6486) time: 8.1455 data: 2.5052 max mem: 71357 -[03:28:39.555610] Epoch: [1] [10/6500] lr: 0.000050 closs: 0.7239 (0.7205) grad_norm: 0.3962 (0.4080) time: 5.7985 data: 0.2279 max mem: 71357 -[03:29:35.233876] Epoch: [1] [20/6500] lr: 0.000050 closs: 0.8120 (0.7878) grad_norm: 0.4027 (0.4338) time: 5.5657 data: 0.0001 max mem: 71357 -[03:30:31.042993] Epoch: [1] [30/6500] lr: 0.000050 closs: 0.8128 (0.7810) grad_norm: 0.4141 (0.4367) time: 5.5743 data: 0.0001 max mem: 71357 -[03:31:26.778182] Epoch: [1] [40/6500] lr: 0.000050 closs: 0.7600 (0.7738) grad_norm: 0.4286 (0.4379) time: 5.5771 data: 0.0001 max mem: 71357 -[03:32:22.493128] Epoch: [1] [50/6500] lr: 0.000050 closs: 0.7715 (0.7786) grad_norm: 0.4286 (0.4348) time: 5.5724 data: 0.0001 max mem: 71357 -[03:33:18.164718] Epoch: [1] [60/6500] lr: 0.000050 closs: 0.7715 (0.7727) grad_norm: 0.4141 (0.4459) time: 5.5692 data: 0.0001 max mem: 71357 -[03:34:13.800199] Epoch: [1] [70/6500] lr: 0.000050 closs: 0.7570 (0.7741) grad_norm: 0.4129 (0.4442) time: 5.5652 data: 0.0001 max mem: 71357 -[03:35:09.583280] Epoch: [1] [80/6500] lr: 0.000050 closs: 0.7335 (0.7625) grad_norm: 0.4129 (0.4435) time: 5.5708 data: 0.0001 max mem: 71357 -[03:36:05.465429] Epoch: [1] [90/6500] lr: 0.000050 closs: 0.7385 (0.7681) grad_norm: 0.4266 (0.4414) time: 5.5832 data: 0.0001 max mem: 71357 -[03:37:01.196399] Epoch: [1] [100/6500] lr: 0.000050 closs: 0.7711 (0.7741) grad_norm: 0.3775 (0.4462) time: 5.5806 data: 0.0001 max mem: 71357 -[03:37:56.957142] Epoch: [1] [110/6500] lr: 0.000050 closs: 0.7738 (0.7750) grad_norm: 0.3745 (0.4366) time: 5.5745 data: 0.0001 max mem: 71357 -[03:38:52.693091] Epoch: [1] [120/6500] lr: 0.000050 closs: 0.7361 (0.7703) grad_norm: 0.3729 (0.4461) time: 5.5747 data: 0.0001 max mem: 71357 -[03:39:48.482751] Epoch: [1] [130/6500] lr: 0.000050 closs: 0.7085 (0.7608) grad_norm: 0.3710 (0.4497) time: 5.5762 data: 0.0001 max mem: 71357 -[03:40:44.069389] Epoch: [1] [140/6500] lr: 0.000050 closs: 0.6935 (0.7555) grad_norm: 0.3846 (0.4603) time: 5.5687 data: 0.0001 max mem: 71357 -[03:41:39.808776] Epoch: [1] [150/6500] lr: 0.000050 closs: 0.7341 (0.7603) grad_norm: 0.3978 (0.4595) time: 5.5662 data: 0.0001 max mem: 71357 -[03:42:35.446981] Epoch: [1] [160/6500] lr: 0.000050 closs: 0.8180 (0.7645) grad_norm: 0.4602 (0.4629) time: 5.5688 data: 0.0001 max mem: 71357 -[03:43:31.177187] Epoch: [1] [170/6500] lr: 0.000050 closs: 0.7881 (0.7646) grad_norm: 0.4495 (0.4614) time: 5.5683 data: 0.0001 max mem: 71357 -[03:44:26.897683] Epoch: [1] [180/6500] lr: 0.000050 closs: 0.8055 (0.7686) grad_norm: 0.4495 (0.4677) time: 5.5724 data: 0.0001 max mem: 71357 -[03:45:22.545885] Epoch: [1] [190/6500] lr: 0.000050 closs: 0.7508 (0.7662) grad_norm: 0.4817 (0.4761) time: 5.5683 data: 0.0001 max mem: 71357 -[03:46:18.218652] Epoch: [1] [200/6500] lr: 0.000050 closs: 0.7508 (0.7668) grad_norm: 0.4273 (0.4732) time: 5.5660 data: 0.0001 max mem: 71357 -[03:47:13.969467] Epoch: [1] [210/6500] lr: 0.000050 closs: 0.7564 (0.7684) grad_norm: 0.4665 (0.4759) time: 5.5711 data: 0.0001 max mem: 71357 -[03:48:09.716365] Epoch: [1] [220/6500] lr: 0.000050 closs: 0.7529 (0.7664) grad_norm: 0.4229 (0.4752) time: 5.5748 data: 0.0001 max mem: 71357 -[03:49:05.445622] Epoch: [1] [230/6500] lr: 0.000050 closs: 0.7457 (0.7659) grad_norm: 0.3906 (0.4711) time: 5.5737 data: 0.0001 max mem: 71357 -[03:50:01.108007] Epoch: [1] [240/6500] lr: 0.000050 closs: 0.7287 (0.7642) grad_norm: 0.4106 (0.4707) time: 5.5695 data: 0.0001 max mem: 71357 -[03:50:56.937160] Epoch: [1] [250/6500] lr: 0.000050 closs: 0.7201 (0.7644) grad_norm: 0.4106 (0.6624) time: 5.5745 data: 0.0001 max mem: 71357 -[03:51:52.784701] Epoch: [1] [260/6500] lr: 0.000050 closs: 0.7706 (0.7645) grad_norm: 0.4106 (0.6523) time: 5.5837 data: 0.0001 max mem: 71357 -[03:52:48.462413] Epoch: [1] [270/6500] lr: 0.000050 closs: 0.7976 (0.7668) grad_norm: 0.4267 (0.6459) time: 5.5761 data: 0.0001 max mem: 71357 -[03:53:44.119411] Epoch: [1] [280/6500] lr: 0.000050 closs: 0.7421 (0.7669) grad_norm: 0.4267 (0.6423) time: 5.5666 data: 0.0002 max mem: 71357 -[03:54:39.776151] Epoch: [1] [290/6500] lr: 0.000050 closs: 0.7414 (0.7677) grad_norm: 0.4362 (0.6379) time: 5.5656 data: 0.0002 max mem: 71357 -[03:55:35.536753] Epoch: [1] [300/6500] lr: 0.000050 closs: 0.7593 (0.7666) grad_norm: 0.5156 (0.6368) time: 5.5707 data: 0.0001 max mem: 71357 -[03:56:31.295194] Epoch: [1] [310/6500] lr: 0.000050 closs: 0.7351 (0.7641) grad_norm: 0.4603 (0.6320) time: 5.5758 data: 0.0001 max mem: 71357 -[03:57:26.899484] Epoch: [1] [320/6500] lr: 0.000050 closs: 0.7863 (0.7665) grad_norm: 0.4362 (0.6254) time: 5.5680 data: 0.0001 max mem: 71357 -[03:58:22.621113] Epoch: [1] [330/6500] lr: 0.000050 closs: 0.7725 (0.7660) grad_norm: 0.4302 (0.6213) time: 5.5662 data: 0.0002 max mem: 71357 -[03:59:18.439876] Epoch: [1] [340/6500] lr: 0.000050 closs: 0.7531 (0.7661) grad_norm: 0.3934 (0.6146) time: 5.5769 data: 0.0002 max mem: 71357 -[04:00:14.348413] Epoch: [1] [350/6500] lr: 0.000050 closs: 0.7826 (0.7659) grad_norm: 0.3847 (0.6065) time: 5.5862 data: 0.0001 max mem: 71357 -[04:01:09.980901] Epoch: [1] [360/6500] lr: 0.000050 closs: 0.8143 (0.7690) grad_norm: 0.3870 (0.6030) time: 5.5769 data: 0.0001 max mem: 71357 -[04:02:05.648563] Epoch: [1] [370/6500] lr: 0.000050 closs: 0.7969 (0.7674) grad_norm: 0.3870 (0.6001) time: 5.5649 data: 0.0001 max mem: 71357 -[04:03:01.348936] Epoch: [1] [380/6500] lr: 0.000050 closs: 0.7290 (0.7683) grad_norm: 0.4082 (0.5945) time: 5.5683 data: 0.0001 max mem: 71357 -[04:03:57.052644] Epoch: [1] [390/6500] lr: 0.000050 closs: 0.7591 (0.7676) grad_norm: 0.4110 (0.5896) time: 5.5701 data: 0.0001 max mem: 71357 -[04:04:52.824329] Epoch: [1] [400/6500] lr: 0.000050 closs: 0.7321 (0.7674) grad_norm: 0.3754 (0.5860) time: 5.5736 data: 0.0001 max mem: 71357 -[04:05:48.442381] Epoch: [1] [410/6500] lr: 0.000050 closs: 0.7504 (0.7677) grad_norm: 0.4082 (0.5832) time: 5.5694 data: 0.0001 max mem: 71357 -[04:06:44.156813] Epoch: [1] [420/6500] lr: 0.000050 closs: 0.7621 (0.7684) grad_norm: 0.3807 (0.5788) time: 5.5665 data: 0.0001 max mem: 71357 -[04:07:39.800804] Epoch: [1] [430/6500] lr: 0.000050 closs: 0.7435 (0.7668) grad_norm: 0.4047 (0.5767) time: 5.5678 data: 0.0001 max mem: 71357 -[04:08:35.582184] Epoch: [1] [440/6500] lr: 0.000050 closs: 0.7382 (0.7663) grad_norm: 0.4047 (0.5735) time: 5.5712 data: 0.0001 max mem: 71357 -[04:09:31.256070] Epoch: [1] [450/6500] lr: 0.000050 closs: 0.7414 (0.7663) grad_norm: 0.3852 (0.5718) time: 5.5727 data: 0.0001 max mem: 71357 -[04:10:26.919187] Epoch: [1] [460/6500] lr: 0.000050 closs: 0.7273 (0.7660) grad_norm: 0.4047 (0.5691) time: 5.5667 data: 0.0001 max mem: 71357 -[04:11:22.572352] Epoch: [1] [470/6500] lr: 0.000050 closs: 0.7546 (0.7670) grad_norm: 0.4096 (0.5692) time: 5.5657 data: 0.0001 max mem: 71357 -[04:12:18.379459] Epoch: [1] [480/6500] lr: 0.000050 closs: 0.7764 (0.7670) grad_norm: 0.4096 (0.5649) time: 5.5729 data: 0.0001 max mem: 71357 -[04:13:14.102252] Epoch: [1] [490/6500] lr: 0.000050 closs: 0.7591 (0.7662) grad_norm: 0.4096 (0.5617) time: 5.5764 data: 0.0001 max mem: 71357 -[04:14:09.753488] Epoch: [1] [500/6500] lr: 0.000050 closs: 0.7008 (0.7658) grad_norm: 0.4468 (0.5610) time: 5.5686 data: 0.0001 max mem: 71357 -[04:15:05.414976] Epoch: [1] [510/6500] lr: 0.000050 closs: 0.7568 (0.7650) grad_norm: 0.3889 (0.5583) time: 5.5655 data: 0.0001 max mem: 71357 -[04:16:01.166383] Epoch: [1] [520/6500] lr: 0.000050 closs: 0.7262 (0.7643) grad_norm: 0.4508 (0.5579) time: 5.5705 data: 0.0001 max mem: 71357 -[04:16:56.919801] Epoch: [1] [530/6500] lr: 0.000050 closs: 0.6799 (0.7621) grad_norm: 0.4514 (0.5565) time: 5.5752 data: 0.0001 max mem: 71357 -[04:17:52.660053] Epoch: [1] [540/6500] lr: 0.000050 closs: 0.6093 (0.7599) grad_norm: 0.4378 (0.5536) time: 5.5746 data: 0.0001 max mem: 71357 -[04:18:48.328201] Epoch: [1] [550/6500] lr: 0.000050 closs: 0.6526 (0.7593) grad_norm: 0.4489 (0.5519) time: 5.5704 data: 0.0002 max mem: 71357 -[04:19:44.116985] Epoch: [1] [560/6500] lr: 0.000050 closs: 0.7426 (0.7588) grad_norm: 0.3843 (0.5489) time: 5.5728 data: 0.0002 max mem: 71357 -[04:20:39.824567] Epoch: [1] [570/6500] lr: 0.000050 closs: 0.7211 (0.7596) grad_norm: 0.3797 (0.5466) time: 5.5747 data: 0.0001 max mem: 71357 -[04:21:35.492317] Epoch: [1] [580/6500] lr: 0.000050 closs: 0.7623 (0.7591) grad_norm: 0.3837 (0.5454) time: 5.5687 data: 0.0001 max mem: 71357 -[04:22:31.256691] Epoch: [1] [590/6500] lr: 0.000050 closs: 0.7601 (0.7591) grad_norm: 0.3874 (0.5442) time: 5.5715 data: 0.0001 max mem: 71357 -[04:23:27.024625] Epoch: [1] [600/6500] lr: 0.000050 closs: 0.7601 (0.7599) grad_norm: 0.4280 (0.5442) time: 5.5765 data: 0.0001 max mem: 71357 -[04:24:22.754492] Epoch: [1] [610/6500] lr: 0.000050 closs: 0.7729 (0.7613) grad_norm: 0.4293 (0.5421) time: 5.5748 data: 0.0001 max mem: 71357 -[04:25:18.641839] Epoch: [1] [620/6500] lr: 0.000050 closs: 0.7729 (0.7617) grad_norm: 0.4281 (0.5403) time: 5.5808 data: 0.0001 max mem: 71357 -[04:26:14.504595] Epoch: [1] [630/6500] lr: 0.000050 closs: 0.7718 (0.7620) grad_norm: 0.4043 (0.5377) time: 5.5874 data: 0.0001 max mem: 71357 -[04:27:10.222475] Epoch: [1] [640/6500] lr: 0.000050 closs: 0.7718 (0.7624) grad_norm: 0.4043 (0.5370) time: 5.5789 data: 0.0001 max mem: 71357 -[04:28:05.954345] Epoch: [1] [650/6500] lr: 0.000050 closs: 0.7636 (0.7626) grad_norm: 0.3995 (0.5348) time: 5.5724 data: 0.0001 max mem: 71357 -[04:29:01.871861] Epoch: [1] [660/6500] lr: 0.000050 closs: 0.7671 (0.7637) grad_norm: 0.3870 (0.5328) time: 5.5824 data: 0.0002 max mem: 71357 -[04:29:57.593541] Epoch: [1] [670/6500] lr: 0.000050 closs: 0.7871 (0.7652) grad_norm: 0.4053 (0.5317) time: 5.5819 data: 0.0002 max mem: 71357 -[04:30:53.271954] Epoch: [1] [680/6500] lr: 0.000050 closs: 0.7538 (0.7653) grad_norm: 0.3957 (0.5294) time: 5.5699 data: 0.0001 max mem: 71357 -[04:31:49.060636] Epoch: [1] [690/6500] lr: 0.000050 closs: 0.7395 (0.7658) grad_norm: 0.3957 (0.5279) time: 5.5733 data: 0.0001 max mem: 71357 -[04:32:44.827788] Epoch: [1] [700/6500] lr: 0.000050 closs: 0.7529 (0.7662) grad_norm: 0.3957 (0.5283) time: 5.5777 data: 0.0001 max mem: 71357 -[04:33:40.570340] Epoch: [1] [710/6500] lr: 0.000050 closs: 0.7456 (0.7660) grad_norm: 0.3900 (0.5295) time: 5.5754 data: 0.0002 max mem: 71357 -[04:34:36.343032] Epoch: [1] [720/6500] lr: 0.000050 closs: 0.7456 (0.7663) grad_norm: 0.3888 (0.5283) time: 5.5756 data: 0.0002 max mem: 71357 -[04:35:32.039216] Epoch: [1] [730/6500] lr: 0.000050 closs: 0.7721 (0.7665) grad_norm: 0.4204 (0.5287) time: 5.5733 data: 0.0001 max mem: 71357 -[04:36:27.753192] Epoch: [1] [740/6500] lr: 0.000050 closs: 0.7721 (0.7673) grad_norm: 0.4113 (0.5270) time: 5.5704 data: 0.0001 max mem: 71357 -[04:37:23.585540] Epoch: [1] [750/6500] lr: 0.000050 closs: 0.7719 (0.7677) grad_norm: 0.3883 (0.5252) time: 5.5772 data: 0.0001 max mem: 71357 -[04:38:19.280542] Epoch: [1] [760/6500] lr: 0.000050 closs: 0.7400 (0.7679) grad_norm: 0.3817 (0.5245) time: 5.5763 data: 0.0001 max mem: 71357 -[04:39:14.994009] Epoch: [1] [770/6500] lr: 0.000050 closs: 0.7093 (0.7682) grad_norm: 0.3817 (0.5237) time: 5.5703 data: 0.0001 max mem: 71357 -[04:40:10.694587] Epoch: [1] [780/6500] lr: 0.000050 closs: 0.7811 (0.7686) grad_norm: 0.3984 (0.5226) time: 5.5706 data: 0.0001 max mem: 71357 -[04:41:06.527247] Epoch: [1] [790/6500] lr: 0.000050 closs: 0.7501 (0.7679) grad_norm: 0.4039 (0.5212) time: 5.5766 data: 0.0001 max mem: 71357 -[04:42:02.287497] Epoch: [1] [800/6500] lr: 0.000050 closs: 0.7254 (0.7673) grad_norm: 0.4044 (0.5203) time: 5.5795 data: 0.0001 max mem: 71357 -[04:42:57.958012] Epoch: [1] [810/6500] lr: 0.000050 closs: 0.6886 (0.7670) grad_norm: 0.4039 (0.5206) time: 5.5714 data: 0.0001 max mem: 71357 -[04:43:53.709271] Epoch: [1] [820/6500] lr: 0.000050 closs: 0.6873 (0.7668) grad_norm: 0.3714 (0.5228) time: 5.5710 data: 0.0001 max mem: 71357 -[04:44:49.456416] Epoch: [1] [830/6500] lr: 0.000050 closs: 0.7426 (0.7667) grad_norm: 0.4044 (0.5218) time: 5.5748 data: 0.0001 max mem: 71357 -[04:45:45.250753] Epoch: [1] [840/6500] lr: 0.000050 closs: 0.7789 (0.7668) grad_norm: 0.4001 (0.5215) time: 5.5770 data: 0.0001 max mem: 71357 -[04:46:40.949223] Epoch: [1] [850/6500] lr: 0.000050 closs: 0.7471 (0.7662) grad_norm: 0.3780 (0.5200) time: 5.5746 data: 0.0001 max mem: 71357 -[04:47:36.561157] Epoch: [1] [860/6500] lr: 0.000050 closs: 0.7247 (0.7658) grad_norm: 0.4001 (0.5197) time: 5.5654 data: 0.0001 max mem: 71357 -[04:48:32.308653] Epoch: [1] [870/6500] lr: 0.000050 closs: 0.7587 (0.7662) grad_norm: 0.4001 (0.5201) time: 5.5679 data: 0.0002 max mem: 71357 -[04:49:28.149376] Epoch: [1] [880/6500] lr: 0.000050 closs: 0.7531 (0.7667) grad_norm: 0.3780 (0.5181) time: 5.5793 data: 0.0002 max mem: 71357 -[04:50:23.950718] Epoch: [1] [890/6500] lr: 0.000050 closs: 0.7416 (0.7664) grad_norm: 0.3905 (0.5193) time: 5.5820 data: 0.0001 max mem: 71357 -[04:51:19.670794] Epoch: [1] [900/6500] lr: 0.000050 closs: 0.7291 (0.7655) grad_norm: 0.3578 (0.5184) time: 5.5760 data: 0.0001 max mem: 71357 -[04:52:15.464157] Epoch: [1] [910/6500] lr: 0.000050 closs: 0.7453 (0.7650) grad_norm: 0.3669 (0.5174) time: 5.5756 data: 0.0001 max mem: 71357 -[04:53:11.330205] Epoch: [1] [920/6500] lr: 0.000050 closs: 0.7024 (0.7651) grad_norm: 0.3843 (0.5163) time: 5.5828 data: 0.0001 max mem: 71357 -[04:54:07.010433] Epoch: [1] [930/6500] lr: 0.000050 closs: 0.7381 (0.7650) grad_norm: 0.3768 (0.5146) time: 5.5772 data: 0.0002 max mem: 71357 -[04:55:02.643380] Epoch: [1] [940/6500] lr: 0.000050 closs: 0.7571 (0.7650) grad_norm: 0.3843 (0.5142) time: 5.5656 data: 0.0002 max mem: 71357 -[04:55:58.347304] Epoch: [1] [950/6500] lr: 0.000050 closs: 0.7475 (0.7644) grad_norm: 0.3946 (0.5133) time: 5.5668 data: 0.0001 max mem: 71357 -[04:56:54.139194] Epoch: [1] [960/6500] lr: 0.000050 closs: 0.7475 (0.7651) grad_norm: 0.3654 (0.5122) time: 5.5747 data: 0.0001 max mem: 71357 -[04:57:50.039666] Epoch: [1] [970/6500] lr: 0.000050 closs: 0.7587 (0.7649) grad_norm: 0.3891 (0.5117) time: 5.5845 data: 0.0001 max mem: 71357 -[04:58:45.709094] Epoch: [1] [980/6500] lr: 0.000050 closs: 0.7098 (0.7645) grad_norm: 0.3654 (0.5107) time: 5.5784 data: 0.0001 max mem: 71357 -[04:59:41.325212] Epoch: [1] [990/6500] lr: 0.000050 closs: 0.7216 (0.7645) grad_norm: 0.3654 (0.5096) time: 5.5642 data: 0.0001 max mem: 71357 -[05:00:36.996785] Epoch: [1] [1000/6500] lr: 0.000050 closs: 0.7500 (0.7646) grad_norm: 0.3880 (0.5092) time: 5.5643 data: 0.0001 max mem: 71357 -[05:01:32.817052] Epoch: [1] [1010/6500] lr: 0.000050 closs: 0.7316 (0.7644) grad_norm: 0.3880 (0.5102) time: 5.5745 data: 0.0001 max mem: 71357 -[05:02:28.530120] Epoch: [1] [1020/6500] lr: 0.000050 closs: 0.7286 (0.7641) grad_norm: 0.4530 (0.5110) time: 5.5766 data: 0.0001 max mem: 71357 -[05:03:24.118826] Epoch: [1] [1030/6500] lr: 0.000050 closs: 0.7875 (0.7641) grad_norm: 0.4651 (0.5107) time: 5.5650 data: 0.0001 max mem: 71357 -[05:04:19.728286] Epoch: [1] [1040/6500] lr: 0.000050 closs: 0.7858 (0.7640) grad_norm: 0.5125 (0.5102) time: 5.5598 data: 0.0001 max mem: 71357 -[05:05:15.444436] Epoch: [1] [1050/6500] lr: 0.000050 closs: 0.7511 (0.7631) grad_norm: 0.4293 (0.5093) time: 5.5662 data: 0.0001 max mem: 71357 -[05:06:11.258329] Epoch: [1] [1060/6500] lr: 0.000050 closs: 0.6617 (0.7621) grad_norm: 0.4393 (0.5098) time: 5.5764 data: 0.0001 max mem: 71357 -[05:07:06.968795] Epoch: [1] [1070/6500] lr: 0.000050 closs: 0.6714 (0.7617) grad_norm: 0.4393 (0.5090) time: 5.5761 data: 0.0001 max mem: 71357 -[05:08:02.608905] Epoch: [1] [1080/6500] lr: 0.000050 closs: 0.6980 (0.7620) grad_norm: 0.4393 (0.5091) time: 5.5675 data: 0.0001 max mem: 71357 -[05:08:58.333156] Epoch: [1] [1090/6500] lr: 0.000050 closs: 0.7979 (0.7625) grad_norm: 0.4444 (0.5083) time: 5.5681 data: 0.0002 max mem: 71357 -[05:09:54.158949] Epoch: [1] [1100/6500] lr: 0.000050 closs: 0.7641 (0.7622) grad_norm: 0.4184 (0.5078) time: 5.5774 data: 0.0002 max mem: 71357 -[05:10:49.934142] Epoch: [1] [1110/6500] lr: 0.000050 closs: 0.7720 (0.7626) grad_norm: 0.4444 (0.5076) time: 5.5800 data: 0.0001 max mem: 71357 -[05:11:45.606270] Epoch: [1] [1120/6500] lr: 0.000050 closs: 0.7850 (0.7630) grad_norm: 0.4184 (0.5068) time: 5.5723 data: 0.0001 max mem: 71357 -[05:12:41.469541] Epoch: [1] [1130/6500] lr: 0.000050 closs: 0.7606 (0.7631) grad_norm: 0.4232 (0.5069) time: 5.5767 data: 0.0001 max mem: 71357 -[05:13:37.226239] Epoch: [1] [1140/6500] lr: 0.000050 closs: 0.7306 (0.7627) grad_norm: 0.4617 (0.5069) time: 5.5809 data: 0.0002 max mem: 71357 -[05:14:32.916965] Epoch: [1] [1150/6500] lr: 0.000050 closs: 0.7464 (0.7632) grad_norm: 0.4232 (0.5061) time: 5.5723 data: 0.0002 max mem: 71357 -[05:15:28.589173] Epoch: [1] [1160/6500] lr: 0.000050 closs: 0.8070 (0.7637) grad_norm: 0.4654 (0.5071) time: 5.5680 data: 0.0001 max mem: 71357 -[05:16:24.252719] Epoch: [1] [1170/6500] lr: 0.000050 closs: 0.7713 (0.7635) grad_norm: 0.4364 (0.5061) time: 5.5667 data: 0.0001 max mem: 71357 -[05:17:19.928322] Epoch: [1] [1180/6500] lr: 0.000050 closs: 0.6577 (0.7623) grad_norm: 0.4311 (0.5059) time: 5.5669 data: 0.0001 max mem: 71357 -[05:18:15.678514] Epoch: [1] [1190/6500] lr: 0.000050 closs: 0.6944 (0.7620) grad_norm: 0.4312 (0.5049) time: 5.5712 data: 0.0001 max mem: 71357 -[05:19:11.287718] Epoch: [1] [1200/6500] lr: 0.000050 closs: 0.7023 (0.7611) grad_norm: 0.4168 (0.5048) time: 5.5679 data: 0.0001 max mem: 71357 -[05:20:06.965210] Epoch: [1] [1210/6500] lr: 0.000050 closs: 0.7447 (0.7617) grad_norm: 0.4043 (0.5037) time: 5.5642 data: 0.0001 max mem: 71357 -[05:21:02.647004] Epoch: [1] [1220/6500] lr: 0.000050 closs: 0.8533 (0.7631) grad_norm: 0.4043 (0.5091) time: 5.5678 data: 0.0001 max mem: 71357 -[05:21:58.423989] Epoch: [1] [1230/6500] lr: 0.000050 closs: 0.8409 (0.7629) grad_norm: 0.4043 (0.5136) time: 5.5728 data: 0.0001 max mem: 71357 -[05:22:54.043709] Epoch: [1] [1240/6500] lr: 0.000050 closs: 0.7011 (0.7626) grad_norm: 0.4162 (0.5134) time: 5.5698 data: 0.0001 max mem: 71357 -[05:23:49.814775] Epoch: [1] [1250/6500] lr: 0.000050 closs: 0.7408 (0.7626) grad_norm: 0.4162 (0.5124) time: 5.5695 data: 0.0001 max mem: 71357 -[05:24:45.428712] Epoch: [1] [1260/6500] lr: 0.000050 closs: 0.7448 (0.7624) grad_norm: 0.4091 (0.5116) time: 5.5691 data: 0.0001 max mem: 71357 -[05:25:41.203614] Epoch: [1] [1270/6500] lr: 0.000050 closs: 0.7227 (0.7624) grad_norm: 0.4091 (0.5118) time: 5.5693 data: 0.0001 max mem: 71357 -[05:26:37.014189] Epoch: [1] [1280/6500] lr: 0.000050 closs: 0.7570 (0.7625) grad_norm: 0.4033 (0.5109) time: 5.5792 data: 0.0001 max mem: 71357 -[05:27:32.726225] Epoch: [1] [1290/6500] lr: 0.000050 closs: 0.7570 (0.7623) grad_norm: 0.4306 (0.5116) time: 5.5760 data: 0.0001 max mem: 71357 -[05:28:28.581411] Epoch: [1] [1300/6500] lr: 0.000050 closs: 0.7812 (0.7627) grad_norm: 0.4184 (0.5103) time: 5.5783 data: 0.0001 max mem: 71357 -[05:29:24.296988] Epoch: [1] [1310/6500] lr: 0.000050 closs: 0.7503 (0.7624) grad_norm: 0.4029 (0.5096) time: 5.5785 data: 0.0001 max mem: 71357 -[05:30:20.071193] Epoch: [1] [1320/6500] lr: 0.000049 closs: 0.7246 (0.7624) grad_norm: 0.4029 (0.5091) time: 5.5744 data: 0.0001 max mem: 71357 -[05:31:15.692148] Epoch: [1] [1330/6500] lr: 0.000049 closs: 0.7118 (0.7625) grad_norm: 0.3548 (0.5083) time: 5.5697 data: 0.0001 max mem: 71357 -[05:32:11.408467] Epoch: [1] [1340/6500] lr: 0.000049 closs: 0.7109 (0.7621) grad_norm: 0.3788 (0.5074) time: 5.5667 data: 0.0001 max mem: 71357 -[05:33:07.091654] Epoch: [1] [1350/6500] lr: 0.000049 closs: 0.7109 (0.7621) grad_norm: 0.3984 (0.5070) time: 5.5699 data: 0.0001 max mem: 71357 -[05:34:02.874850] Epoch: [1] [1360/6500] lr: 0.000049 closs: 0.7131 (0.7629) grad_norm: 0.3984 (0.5061) time: 5.5732 data: 0.0001 max mem: 71357 -[05:34:58.543548] Epoch: [1] [1370/6500] lr: 0.000049 closs: 0.7400 (0.7626) grad_norm: 0.3952 (0.5052) time: 5.5725 data: 0.0001 max mem: 71357 -[05:35:54.208343] Epoch: [1] [1380/6500] lr: 0.000049 closs: 0.7499 (0.7630) grad_norm: 0.3952 (0.5043) time: 5.5666 data: 0.0001 max mem: 71357 -[05:36:49.991759] Epoch: [1] [1390/6500] lr: 0.000049 closs: 0.7824 (0.7628) grad_norm: 0.3701 (0.5033) time: 5.5723 data: 0.0001 max mem: 71357 -[05:37:45.788654] Epoch: [1] [1400/6500] lr: 0.000049 closs: 0.7824 (0.7628) grad_norm: 0.3727 (0.5032) time: 5.5789 data: 0.0001 max mem: 71357 -[05:38:41.516767] Epoch: [1] [1410/6500] lr: 0.000049 closs: 0.8001 (0.7629) grad_norm: 0.3895 (0.5029) time: 5.5761 data: 0.0001 max mem: 71357 -[05:39:37.272883] Epoch: [1] [1420/6500] lr: 0.000049 closs: 0.8019 (0.7634) grad_norm: 0.3895 (0.5022) time: 5.5741 data: 0.0001 max mem: 71357 -[05:40:32.946983] Epoch: [1] [1430/6500] lr: 0.000049 closs: 0.7728 (0.7634) grad_norm: 0.4048 (0.5015) time: 5.5714 data: 0.0001 max mem: 71357 -[05:41:28.638478] Epoch: [1] [1440/6500] lr: 0.000049 closs: 0.6953 (0.7633) grad_norm: 0.4000 (0.5010) time: 5.5682 data: 0.0001 max mem: 71357 -[05:42:24.408111] Epoch: [1] [1450/6500] lr: 0.000049 closs: 0.7377 (0.7630) grad_norm: 0.4000 (0.5005) time: 5.5729 data: 0.0001 max mem: 71357 -[05:43:20.139266] Epoch: [1] [1460/6500] lr: 0.000049 closs: 0.7676 (0.7636) grad_norm: 0.4100 (0.5000) time: 5.5749 data: 0.0001 max mem: 71357 -[05:44:15.852405] Epoch: [1] [1470/6500] lr: 0.000049 closs: 0.8084 (0.7637) grad_norm: 0.4047 (0.4994) time: 5.5721 data: 0.0001 max mem: 71357 -[05:45:11.499402] Epoch: [1] [1480/6500] lr: 0.000049 closs: 0.7897 (0.7633) grad_norm: 0.4047 (0.4993) time: 5.5679 data: 0.0001 max mem: 71357 -[05:46:07.251883] Epoch: [1] [1490/6500] lr: 0.000049 closs: 0.7280 (0.7636) grad_norm: 0.3998 (0.4990) time: 5.5699 data: 0.0001 max mem: 71357 -[05:47:03.095336] Epoch: [1] [1500/6500] lr: 0.000049 closs: 0.7501 (0.7636) grad_norm: 0.3807 (0.4985) time: 5.5797 data: 0.0001 max mem: 71357 -[05:47:58.748278] Epoch: [1] [1510/6500] lr: 0.000049 closs: 0.7772 (0.7638) grad_norm: 0.3998 (0.4986) time: 5.5747 data: 0.0001 max mem: 71357 -[05:48:54.481750] Epoch: [1] [1520/6500] lr: 0.000049 closs: 0.7816 (0.7640) grad_norm: 0.3807 (0.4983) time: 5.5692 data: 0.0002 max mem: 71357 -[05:49:50.180751] Epoch: [1] [1530/6500] lr: 0.000049 closs: 0.7623 (0.7641) grad_norm: 0.3790 (0.4979) time: 5.5715 data: 0.0002 max mem: 71357 -[05:50:45.971116] Epoch: [1] [1540/6500] lr: 0.000049 closs: 0.8359 (0.7645) grad_norm: 0.3935 (0.4972) time: 5.5743 data: 0.0001 max mem: 71357 -[05:51:41.648187] Epoch: [1] [1550/6500] lr: 0.000049 closs: 0.7772 (0.7649) grad_norm: 0.3974 (0.4970) time: 5.5733 data: 0.0001 max mem: 71357 -[05:52:37.329020] Epoch: [1] [1560/6500] lr: 0.000049 closs: 0.7288 (0.7646) grad_norm: 0.4314 (0.4972) time: 5.5678 data: 0.0001 max mem: 71357 -[05:53:33.021624] Epoch: [1] [1570/6500] lr: 0.000049 closs: 0.7566 (0.7647) grad_norm: 0.4203 (0.4978) time: 5.5686 data: 0.0001 max mem: 71357 -[05:54:28.849243] Epoch: [1] [1580/6500] lr: 0.000049 closs: 0.7969 (0.7649) grad_norm: 0.4503 (0.4972) time: 5.5759 data: 0.0001 max mem: 71357 -[05:55:24.622675] Epoch: [1] [1590/6500] lr: 0.000049 closs: 0.7610 (0.7645) grad_norm: 0.4014 (0.4963) time: 5.5800 data: 0.0001 max mem: 71357 -[05:56:20.321965] Epoch: [1] [1600/6500] lr: 0.000049 closs: 0.7300 (0.7644) grad_norm: 0.3717 (0.4956) time: 5.5735 data: 0.0001 max mem: 71357 -[05:57:16.028460] Epoch: [1] [1610/6500] lr: 0.000049 closs: 0.7300 (0.7642) grad_norm: 0.3510 (0.4955) time: 5.5702 data: 0.0001 max mem: 71357 -[05:58:11.783706] Epoch: [1] [1620/6500] lr: 0.000049 closs: 0.7731 (0.7643) grad_norm: 0.3510 (0.4947) time: 5.5730 data: 0.0001 max mem: 71357 -[05:59:07.570790] Epoch: [1] [1630/6500] lr: 0.000049 closs: 0.8139 (0.7647) grad_norm: 0.4063 (0.4945) time: 5.5770 data: 0.0001 max mem: 71357 -[06:00:03.238802] Epoch: [1] [1640/6500] lr: 0.000049 closs: 0.8139 (0.7646) grad_norm: 0.4153 (0.4940) time: 5.5727 data: 0.0001 max mem: 71357 -[06:00:58.977098] Epoch: [1] [1650/6500] lr: 0.000049 closs: 0.7872 (0.7648) grad_norm: 0.4063 (0.4936) time: 5.5702 data: 0.0001 max mem: 71357 -[06:01:54.788329] Epoch: [1] [1660/6500] lr: 0.000049 closs: 0.7872 (0.7650) grad_norm: 0.4311 (0.4933) time: 5.5774 data: 0.0001 max mem: 71357 -[06:02:50.593146] Epoch: [1] [1670/6500] lr: 0.000049 closs: 0.7356 (0.7648) grad_norm: 0.4235 (0.4928) time: 5.5807 data: 0.0001 max mem: 71357 -[06:03:46.216024] Epoch: [1] [1680/6500] lr: 0.000049 closs: 0.7429 (0.7648) grad_norm: 0.4235 (0.4927) time: 5.5713 data: 0.0002 max mem: 71357 -[06:04:41.979289] Epoch: [1] [1690/6500] lr: 0.000049 closs: 0.7711 (0.7647) grad_norm: 0.4235 (0.4929) time: 5.5692 data: 0.0002 max mem: 71357 -[06:05:37.689767] Epoch: [1] [1700/6500] lr: 0.000049 closs: 0.7552 (0.7647) grad_norm: 0.4142 (0.4924) time: 5.5736 data: 0.0001 max mem: 71357 -[06:06:33.493941] Epoch: [1] [1710/6500] lr: 0.000049 closs: 0.6966 (0.7640) grad_norm: 0.3993 (0.4918) time: 5.5756 data: 0.0001 max mem: 71357 -[06:07:29.339116] Epoch: [1] [1720/6500] lr: 0.000049 closs: 0.6799 (0.7640) grad_norm: 0.3993 (0.4916) time: 5.5824 data: 0.0001 max mem: 71357 -[06:08:25.102712] Epoch: [1] [1730/6500] lr: 0.000049 closs: 0.6988 (0.7640) grad_norm: 0.3986 (0.4910) time: 5.5803 data: 0.0001 max mem: 71357 -[06:09:20.832118] Epoch: [1] [1740/6500] lr: 0.000049 closs: 0.6702 (0.7637) grad_norm: 0.3993 (0.4907) time: 5.5746 data: 0.0001 max mem: 71357 -[06:10:16.553919] Epoch: [1] [1750/6500] lr: 0.000049 closs: 0.6695 (0.7633) grad_norm: 0.4026 (0.4905) time: 5.5725 data: 0.0001 max mem: 71357 -[06:11:12.343083] Epoch: [1] [1760/6500] lr: 0.000049 closs: 0.7046 (0.7632) grad_norm: 0.3957 (0.4898) time: 5.5755 data: 0.0001 max mem: 71357 -[06:12:08.075303] Epoch: [1] [1770/6500] lr: 0.000049 closs: 0.7100 (0.7634) grad_norm: 0.3782 (0.4889) time: 5.5760 data: 0.0001 max mem: 71357 -[06:13:03.741874] Epoch: [1] [1780/6500] lr: 0.000049 closs: 0.7462 (0.7635) grad_norm: 0.3682 (0.4882) time: 5.5698 data: 0.0001 max mem: 71357 -[06:13:59.467131] Epoch: [1] [1790/6500] lr: 0.000049 closs: 0.7743 (0.7633) grad_norm: 0.3682 (0.4882) time: 5.5695 data: 0.0002 max mem: 71357 -[06:14:55.302278] Epoch: [1] [1800/6500] lr: 0.000049 closs: 0.7203 (0.7629) grad_norm: 0.3880 (0.4885) time: 5.5779 data: 0.0002 max mem: 71357 -[06:15:51.032856] Epoch: [1] [1810/6500] lr: 0.000049 closs: 0.7254 (0.7630) grad_norm: 0.4342 (0.4885) time: 5.5782 data: 0.0001 max mem: 71357 -[06:16:46.808428] Epoch: [1] [1820/6500] lr: 0.000049 closs: 0.7463 (0.7628) grad_norm: 0.4342 (0.4879) time: 5.5752 data: 0.0001 max mem: 71357 -[06:17:42.566412] Epoch: [1] [1830/6500] lr: 0.000049 closs: 0.7301 (0.7628) grad_norm: 0.4255 (0.4880) time: 5.5766 data: 0.0001 max mem: 71357 -[06:18:38.312144] Epoch: [1] [1840/6500] lr: 0.000049 closs: 0.8045 (0.7632) grad_norm: 0.3756 (0.4880) time: 5.5751 data: 0.0001 max mem: 71357 -[06:19:34.202602] Epoch: [1] [1850/6500] lr: 0.000049 closs: 0.7885 (0.7632) grad_norm: 0.3538 (0.4873) time: 5.5817 data: 0.0001 max mem: 71357 -[06:20:29.987714] Epoch: [1] [1860/6500] lr: 0.000049 closs: 0.7457 (0.7629) grad_norm: 0.3515 (0.4866) time: 5.5837 data: 0.0001 max mem: 71357 -[06:21:25.707631] Epoch: [1] [1870/6500] lr: 0.000049 closs: 0.7457 (0.7628) grad_norm: 0.3717 (0.4872) time: 5.5752 data: 0.0001 max mem: 71357 -[06:22:21.508496] Epoch: [1] [1880/6500] lr: 0.000049 closs: 0.7003 (0.7626) grad_norm: 0.3717 (0.4877) time: 5.5759 data: 0.0001 max mem: 71357 -[06:23:17.323008] Epoch: [1] [1890/6500] lr: 0.000049 closs: 0.7003 (0.7624) grad_norm: 0.3850 (0.4873) time: 5.5806 data: 0.0001 max mem: 71357 -[06:24:13.089745] Epoch: [1] [1900/6500] lr: 0.000049 closs: 0.7211 (0.7625) grad_norm: 0.4018 (0.4866) time: 5.5790 data: 0.0001 max mem: 71357 -[06:25:08.735665] Epoch: [1] [1910/6500] lr: 0.000049 closs: 0.7094 (0.7621) grad_norm: 0.4018 (0.4866) time: 5.5705 data: 0.0001 max mem: 71357 -[06:26:04.431078] Epoch: [1] [1920/6500] lr: 0.000049 closs: 0.6812 (0.7620) grad_norm: 0.4018 (0.4865) time: 5.5669 data: 0.0001 max mem: 71357 -[06:27:00.205152] Epoch: [1] [1930/6500] lr: 0.000049 closs: 0.7006 (0.7619) grad_norm: 0.4018 (0.4860) time: 5.5734 data: 0.0001 max mem: 71357 -[06:27:56.086508] Epoch: [1] [1940/6500] lr: 0.000049 closs: 0.7977 (0.7623) grad_norm: 0.3875 (0.4854) time: 5.5827 data: 0.0001 max mem: 71357 -[06:28:51.724118] Epoch: [1] [1950/6500] lr: 0.000049 closs: 0.7787 (0.7624) grad_norm: 0.3773 (0.4852) time: 5.5759 data: 0.0001 max mem: 71357 -[06:29:47.487192] Epoch: [1] [1960/6500] lr: 0.000049 closs: 0.7331 (0.7623) grad_norm: 0.3773 (0.4848) time: 5.5699 data: 0.0001 max mem: 71357 -[06:30:43.261286] Epoch: [1] [1970/6500] lr: 0.000049 closs: 0.7788 (0.7628) grad_norm: 0.3829 (0.4843) time: 5.5768 data: 0.0001 max mem: 71357 -[06:31:38.991880] Epoch: [1] [1980/6500] lr: 0.000049 closs: 0.7698 (0.7625) grad_norm: 0.3889 (0.4838) time: 5.5751 data: 0.0001 max mem: 71357 -[06:32:34.722420] Epoch: [1] [1990/6500] lr: 0.000049 closs: 0.7517 (0.7626) grad_norm: 0.3977 (0.4837) time: 5.5730 data: 0.0001 max mem: 71357 -[06:33:30.426934] Epoch: [1] [2000/6500] lr: 0.000049 closs: 0.7535 (0.7625) grad_norm: 0.3977 (0.4837) time: 5.5717 data: 0.0001 max mem: 71357 -[06:34:26.157107] Epoch: [1] [2010/6500] lr: 0.000049 closs: 0.7385 (0.7624) grad_norm: 0.3737 (0.4831) time: 5.5717 data: 0.0001 max mem: 71357 -[06:35:21.974735] Epoch: [1] [2020/6500] lr: 0.000049 closs: 0.7578 (0.7625) grad_norm: 0.4042 (0.4834) time: 5.5773 data: 0.0001 max mem: 71357 -[06:36:17.629504] Epoch: [1] [2030/6500] lr: 0.000049 closs: 0.8125 (0.7628) grad_norm: 0.3830 (0.4834) time: 5.5735 data: 0.0001 max mem: 71357 -[06:37:13.410785] Epoch: [1] [2040/6500] lr: 0.000049 closs: 0.8006 (0.7627) grad_norm: 0.3633 (0.4827) time: 5.5717 data: 0.0001 max mem: 71357 -[06:38:09.181666] Epoch: [1] [2050/6500] lr: 0.000049 closs: 0.7954 (0.7626) grad_norm: 0.3844 (0.4823) time: 5.5775 data: 0.0001 max mem: 71357 -[06:39:04.900937] Epoch: [1] [2060/6500] lr: 0.000049 closs: 0.7756 (0.7627) grad_norm: 0.3844 (0.4822) time: 5.5744 data: 0.0001 max mem: 71357 -[06:40:00.725173] Epoch: [1] [2070/6500] lr: 0.000049 closs: 0.7406 (0.7625) grad_norm: 0.3707 (0.4816) time: 5.5771 data: 0.0001 max mem: 71357 -[06:40:56.513573] Epoch: [1] [2080/6500] lr: 0.000049 closs: 0.7405 (0.7626) grad_norm: 0.4113 (0.4810) time: 5.5806 data: 0.0001 max mem: 71357 -[06:41:52.319453] Epoch: [1] [2090/6500] lr: 0.000049 closs: 0.7236 (0.7623) grad_norm: 0.3818 (0.4805) time: 5.5796 data: 0.0001 max mem: 71357 -[06:42:48.076246] Epoch: [1] [2100/6500] lr: 0.000049 closs: 0.6560 (0.7622) grad_norm: 0.3610 (0.4800) time: 5.5781 data: 0.0001 max mem: 71357 -[06:43:43.875280] Epoch: [1] [2110/6500] lr: 0.000049 closs: 0.6950 (0.7623) grad_norm: 0.3698 (0.4796) time: 5.5777 data: 0.0001 max mem: 71357 -[06:44:39.638406] Epoch: [1] [2120/6500] lr: 0.000049 closs: 0.7491 (0.7621) grad_norm: 0.3982 (0.4794) time: 5.5780 data: 0.0001 max mem: 71357 -[06:45:35.343891] Epoch: [1] [2130/6500] lr: 0.000049 closs: 0.7413 (0.7621) grad_norm: 0.4028 (0.4792) time: 5.5733 data: 0.0001 max mem: 71357 -[06:46:31.017460] Epoch: [1] [2140/6500] lr: 0.000049 closs: 0.7413 (0.7618) grad_norm: 0.4288 (0.4796) time: 5.5688 data: 0.0001 max mem: 71357 -[06:47:26.831774] Epoch: [1] [2150/6500] lr: 0.000049 closs: 0.7122 (0.7617) grad_norm: 0.4262 (0.4791) time: 5.5743 data: 0.0001 max mem: 71357 -[06:48:22.566097] Epoch: [1] [2160/6500] lr: 0.000049 closs: 0.7195 (0.7616) grad_norm: 0.3987 (0.4787) time: 5.5774 data: 0.0001 max mem: 71357 -[06:49:18.229337] Epoch: [1] [2170/6500] lr: 0.000049 closs: 0.7297 (0.7615) grad_norm: 0.3768 (0.4784) time: 5.5698 data: 0.0001 max mem: 71357 -[06:50:13.979986] Epoch: [1] [2180/6500] lr: 0.000049 closs: 0.7356 (0.7615) grad_norm: 0.3603 (0.4779) time: 5.5706 data: 0.0001 max mem: 71357 -[06:51:09.697275] Epoch: [1] [2190/6500] lr: 0.000049 closs: 0.7507 (0.7619) grad_norm: 0.3768 (0.4775) time: 5.5733 data: 0.0001 max mem: 71357 -[06:52:05.550399] Epoch: [1] [2200/6500] lr: 0.000049 closs: 0.6988 (0.7616) grad_norm: 0.3747 (0.4779) time: 5.5784 data: 0.0001 max mem: 71357 -[06:53:01.293904] Epoch: [1] [2210/6500] lr: 0.000049 closs: 0.6853 (0.7614) grad_norm: 0.3885 (0.4777) time: 5.5797 data: 0.0001 max mem: 71357 -[06:53:56.969787] Epoch: [1] [2220/6500] lr: 0.000049 closs: 0.7125 (0.7612) grad_norm: 0.4290 (0.4780) time: 5.5709 data: 0.0002 max mem: 71357 -[06:54:52.673780] Epoch: [1] [2230/6500] lr: 0.000049 closs: 0.7342 (0.7614) grad_norm: 0.4588 (0.4781) time: 5.5689 data: 0.0002 max mem: 71357 -[06:55:48.376439] Epoch: [1] [2240/6500] lr: 0.000049 closs: 0.8235 (0.7617) grad_norm: 0.4588 (0.4780) time: 5.5702 data: 0.0001 max mem: 71357 -[06:56:44.026518] Epoch: [1] [2250/6500] lr: 0.000049 closs: 0.7678 (0.7616) grad_norm: 0.4054 (0.4783) time: 5.5675 data: 0.0001 max mem: 71357 -[06:57:39.613302] Epoch: [1] [2260/6500] lr: 0.000049 closs: 0.7678 (0.7619) grad_norm: 0.4079 (0.4790) time: 5.5618 data: 0.0001 max mem: 71357 -[06:58:35.230225] Epoch: [1] [2270/6500] lr: 0.000049 closs: 0.8237 (0.7621) grad_norm: 0.4522 (0.4794) time: 5.5601 data: 0.0001 max mem: 71357 -[06:59:30.944187] Epoch: [1] [2280/6500] lr: 0.000048 closs: 0.7225 (0.7620) grad_norm: 0.4522 (0.4798) time: 5.5664 data: 0.0001 max mem: 71357 -[07:00:26.770134] Epoch: [1] [2290/6500] lr: 0.000048 closs: 0.7159 (0.7618) grad_norm: 0.4822 (0.4797) time: 5.5769 data: 0.0001 max mem: 71357 -[07:01:22.490132] Epoch: [1] [2300/6500] lr: 0.000048 closs: 0.7169 (0.7616) grad_norm: 0.4143 (0.4794) time: 5.5772 data: 0.0001 max mem: 71357 -[07:02:18.117158] Epoch: [1] [2310/6500] lr: 0.000048 closs: 0.7632 (0.7617) grad_norm: 0.3907 (0.4794) time: 5.5673 data: 0.0001 max mem: 71357 -[07:03:13.846062] Epoch: [1] [2320/6500] lr: 0.000048 closs: 0.7486 (0.7616) grad_norm: 0.3819 (0.4791) time: 5.5677 data: 0.0001 max mem: 71357 -[07:04:09.641871] Epoch: [1] [2330/6500] lr: 0.000048 closs: 0.7391 (0.7615) grad_norm: 0.4143 (0.4791) time: 5.5762 data: 0.0001 max mem: 71357 -[07:05:05.451518] Epoch: [1] [2340/6500] lr: 0.000048 closs: 0.7884 (0.7618) grad_norm: 0.3943 (0.4788) time: 5.5802 data: 0.0001 max mem: 71357 -[07:06:01.130149] Epoch: [1] [2350/6500] lr: 0.000048 closs: 0.7346 (0.7614) grad_norm: 0.4021 (0.4786) time: 5.5743 data: 0.0001 max mem: 71357 -[07:06:56.845866] Epoch: [1] [2360/6500] lr: 0.000048 closs: 0.7227 (0.7618) grad_norm: 0.4021 (0.4786) time: 5.5696 data: 0.0001 max mem: 71357 -[07:07:52.563701] Epoch: [1] [2370/6500] lr: 0.000048 closs: 0.8105 (0.7620) grad_norm: 0.3804 (0.4782) time: 5.5716 data: 0.0001 max mem: 71357 -[07:08:48.395607] Epoch: [1] [2380/6500] lr: 0.000048 closs: 0.7941 (0.7621) grad_norm: 0.4021 (0.4783) time: 5.5774 data: 0.0001 max mem: 71357 -[07:09:44.149078] Epoch: [1] [2390/6500] lr: 0.000048 closs: 0.7885 (0.7619) grad_norm: 0.3905 (0.4779) time: 5.5792 data: 0.0001 max mem: 71357 -[07:10:39.820251] Epoch: [1] [2400/6500] lr: 0.000048 closs: 0.6213 (0.7615) grad_norm: 0.4045 (0.4780) time: 5.5712 data: 0.0001 max mem: 71357 -[07:11:35.486229] Epoch: [1] [2410/6500] lr: 0.000048 closs: 0.6479 (0.7614) grad_norm: 0.3905 (0.4782) time: 5.5668 data: 0.0001 max mem: 71357 -[07:12:31.320418] Epoch: [1] [2420/6500] lr: 0.000048 closs: 0.7154 (0.7611) grad_norm: 0.3799 (0.4777) time: 5.5749 data: 0.0001 max mem: 71357 -[07:13:27.010058] Epoch: [1] [2430/6500] lr: 0.000048 closs: 0.7631 (0.7612) grad_norm: 0.3917 (0.4774) time: 5.5761 data: 0.0001 max mem: 71357 -[07:14:22.706755] Epoch: [1] [2440/6500] lr: 0.000048 closs: 0.7958 (0.7615) grad_norm: 0.3609 (0.4771) time: 5.5692 data: 0.0001 max mem: 71357 -[07:15:18.472441] Epoch: [1] [2450/6500] lr: 0.000048 closs: 0.7683 (0.7613) grad_norm: 0.3452 (0.4766) time: 5.5730 data: 0.0001 max mem: 71357 -[07:16:14.358302] Epoch: [1] [2460/6500] lr: 0.000048 closs: 0.7683 (0.7612) grad_norm: 0.3917 (0.4763) time: 5.5825 data: 0.0001 max mem: 71357 -[07:17:09.997614] Epoch: [1] [2470/6500] lr: 0.000048 closs: 0.8213 (0.7615) grad_norm: 0.3609 (0.4760) time: 5.5761 data: 0.0001 max mem: 71357 -[07:18:05.674747] Epoch: [1] [2480/6500] lr: 0.000048 closs: 0.7941 (0.7614) grad_norm: 0.4185 (0.4761) time: 5.5657 data: 0.0001 max mem: 71357 -[07:19:01.392136] Epoch: [1] [2490/6500] lr: 0.000048 closs: 0.7147 (0.7614) grad_norm: 0.4099 (0.4756) time: 5.5696 data: 0.0002 max mem: 71357 -[07:19:57.106876] Epoch: [1] [2500/6500] lr: 0.000048 closs: 0.7326 (0.7611) grad_norm: 0.3720 (0.4753) time: 5.5715 data: 0.0002 max mem: 71357 -[07:20:52.874861] Epoch: [1] [2510/6500] lr: 0.000048 closs: 0.6783 (0.7609) grad_norm: 0.3720 (0.4748) time: 5.5740 data: 0.0001 max mem: 71357 -[07:21:48.643566] Epoch: [1] [2520/6500] lr: 0.000048 closs: 0.6970 (0.7610) grad_norm: 0.3627 (0.4746) time: 5.5767 data: 0.0001 max mem: 71357 -[07:22:44.328112] Epoch: [1] [2530/6500] lr: 0.000048 closs: 0.7447 (0.7608) grad_norm: 0.3790 (0.4745) time: 5.5726 data: 0.0001 max mem: 71357 -[07:23:40.104620] Epoch: [1] [2540/6500] lr: 0.000048 closs: 0.7475 (0.7607) grad_norm: 0.3792 (0.4743) time: 5.5730 data: 0.0001 max mem: 71357 -[07:24:35.986148] Epoch: [1] [2550/6500] lr: 0.000048 closs: 0.8081 (0.7609) grad_norm: 0.3877 (0.4740) time: 5.5828 data: 0.0001 max mem: 71357 -[07:25:31.762648] Epoch: [1] [2560/6500] lr: 0.000048 closs: 0.8194 (0.7610) grad_norm: 0.4054 (0.4738) time: 5.5828 data: 0.0001 max mem: 71357 -[07:26:27.517291] Epoch: [1] [2570/6500] lr: 0.000048 closs: 0.8143 (0.7612) grad_norm: 0.4320 (0.4736) time: 5.5765 data: 0.0001 max mem: 71357 -[07:27:23.081161] Epoch: [1] [2580/6500] lr: 0.000048 closs: 0.8143 (0.7614) grad_norm: 0.4400 (0.4736) time: 5.5658 data: 0.0001 max mem: 71357 -[07:28:18.847361] Epoch: [1] [2590/6500] lr: 0.000048 closs: 0.7732 (0.7615) grad_norm: 0.4291 (0.4731) time: 5.5664 data: 0.0001 max mem: 71357 -[07:29:14.631754] Epoch: [1] [2600/6500] lr: 0.000048 closs: 0.7687 (0.7615) grad_norm: 0.4291 (0.4730) time: 5.5774 data: 0.0001 max mem: 71357 -[07:30:10.313627] Epoch: [1] [2610/6500] lr: 0.000048 closs: 0.7136 (0.7613) grad_norm: 0.4208 (0.4730) time: 5.5732 data: 0.0001 max mem: 71357 -[07:31:06.069722] Epoch: [1] [2620/6500] lr: 0.000048 closs: 0.7577 (0.7615) grad_norm: 0.3758 (0.4727) time: 5.5718 data: 0.0001 max mem: 71357 -[07:32:01.904851] Epoch: [1] [2630/6500] lr: 0.000048 closs: 0.7331 (0.7612) grad_norm: 0.3758 (0.4724) time: 5.5795 data: 0.0001 max mem: 71357 -[07:32:57.687644] Epoch: [1] [2640/6500] lr: 0.000048 closs: 0.7558 (0.7615) grad_norm: 0.3758 (0.4724) time: 5.5808 data: 0.0001 max mem: 71357 -[07:33:53.438636] Epoch: [1] [2650/6500] lr: 0.000048 closs: 0.7639 (0.7615) grad_norm: 0.3693 (0.4720) time: 5.5765 data: 0.0001 max mem: 71357 -[07:34:49.158394] Epoch: [1] [2660/6500] lr: 0.000048 closs: 0.7320 (0.7611) grad_norm: 0.3779 (0.4722) time: 5.5734 data: 0.0001 max mem: 71357 -[07:35:44.955768] Epoch: [1] [2670/6500] lr: 0.000048 closs: 0.7320 (0.7612) grad_norm: 0.3779 (0.4717) time: 5.5758 data: 0.0001 max mem: 71357 -[07:36:40.729888] Epoch: [1] [2680/6500] lr: 0.000048 closs: 0.7680 (0.7613) grad_norm: 0.3804 (0.4715) time: 5.5785 data: 0.0001 max mem: 71357 -[07:37:36.512085] Epoch: [1] [2690/6500] lr: 0.000048 closs: 0.7978 (0.7615) grad_norm: 0.4166 (0.4714) time: 5.5777 data: 0.0001 max mem: 71357 -[07:38:32.202375] Epoch: [1] [2700/6500] lr: 0.000048 closs: 0.7734 (0.7615) grad_norm: 0.3963 (0.4712) time: 5.5735 data: 0.0001 max mem: 71357 -[07:39:27.865502] Epoch: [1] [2710/6500] lr: 0.000048 closs: 0.7563 (0.7617) grad_norm: 0.4106 (0.4714) time: 5.5675 data: 0.0002 max mem: 71357 -[07:40:23.614087] Epoch: [1] [2720/6500] lr: 0.000048 closs: 0.8125 (0.7619) grad_norm: 0.3954 (0.4711) time: 5.5705 data: 0.0002 max mem: 71357 -[07:41:19.483972] Epoch: [1] [2730/6500] lr: 0.000048 closs: 0.8166 (0.7619) grad_norm: 0.3848 (0.4707) time: 5.5808 data: 0.0001 max mem: 71357 -[07:42:15.165152] Epoch: [1] [2740/6500] lr: 0.000048 closs: 0.7304 (0.7619) grad_norm: 0.3909 (0.4707) time: 5.5775 data: 0.0001 max mem: 71357 -[07:43:10.903891] Epoch: [1] [2750/6500] lr: 0.000048 closs: 0.7304 (0.7617) grad_norm: 0.3943 (0.4704) time: 5.5709 data: 0.0001 max mem: 71357 -[07:44:06.612623] Epoch: [1] [2760/6500] lr: 0.000048 closs: 0.6582 (0.7613) grad_norm: 0.3752 (0.4701) time: 5.5723 data: 0.0002 max mem: 71357 -[07:45:02.480213] Epoch: [1] [2770/6500] lr: 0.000048 closs: 0.6582 (0.7612) grad_norm: 0.3752 (0.4696) time: 5.5787 data: 0.0002 max mem: 71357 -[07:45:58.196941] Epoch: [1] [2780/6500] lr: 0.000048 closs: 0.7057 (0.7612) grad_norm: 0.3494 (0.4695) time: 5.5791 data: 0.0001 max mem: 71357 -[07:46:53.877620] Epoch: [1] [2790/6500] lr: 0.000048 closs: 0.7393 (0.7613) grad_norm: 0.3485 (0.4697) time: 5.5698 data: 0.0001 max mem: 71357 -[07:47:49.527855] Epoch: [1] [2800/6500] lr: 0.000048 closs: 0.7912 (0.7615) grad_norm: 0.3532 (0.4696) time: 5.5664 data: 0.0001 max mem: 71357 -[07:48:45.322368] Epoch: [1] [2810/6500] lr: 0.000048 closs: 0.7889 (0.7616) grad_norm: 0.3971 (0.4696) time: 5.5721 data: 0.0001 max mem: 71357 -[07:49:41.244305] Epoch: [1] [2820/6500] lr: 0.000048 closs: 0.7473 (0.7618) grad_norm: 0.4032 (0.4694) time: 5.5857 data: 0.0002 max mem: 71357 -[07:50:37.037384] Epoch: [1] [2830/6500] lr: 0.000048 closs: 0.7493 (0.7616) grad_norm: 0.3973 (0.4690) time: 5.5857 data: 0.0002 max mem: 71357 -[07:51:32.889723] Epoch: [1] [2840/6500] lr: 0.000048 closs: 0.7518 (0.7616) grad_norm: 0.3720 (0.4686) time: 5.5822 data: 0.0001 max mem: 71357 -[07:52:28.686394] Epoch: [1] [2850/6500] lr: 0.000048 closs: 0.7385 (0.7615) grad_norm: 0.3720 (0.4684) time: 5.5823 data: 0.0001 max mem: 71357 -[07:53:24.563749] Epoch: [1] [2860/6500] lr: 0.000048 closs: 0.7385 (0.7615) grad_norm: 0.3721 (0.4692) time: 5.5836 data: 0.0001 max mem: 71357 -[07:54:20.254650] Epoch: [1] [2870/6500] lr: 0.000048 closs: 0.7785 (0.7615) grad_norm: 0.3818 (0.4690) time: 5.5783 data: 0.0002 max mem: 71357 -[07:55:15.940546] Epoch: [1] [2880/6500] lr: 0.000048 closs: 0.7861 (0.7616) grad_norm: 0.3890 (0.4688) time: 5.5687 data: 0.0002 max mem: 71357 -[07:56:11.671960] Epoch: [1] [2890/6500] lr: 0.000048 closs: 0.7440 (0.7616) grad_norm: 0.4069 (0.4688) time: 5.5708 data: 0.0001 max mem: 71357 -[07:57:07.459023] Epoch: [1] [2900/6500] lr: 0.000048 closs: 0.7440 (0.7617) grad_norm: 0.3818 (0.4685) time: 5.5759 data: 0.0001 max mem: 71357 -[07:58:03.203864] Epoch: [1] [2910/6500] lr: 0.000048 closs: 0.7739 (0.7616) grad_norm: 0.3697 (0.4684) time: 5.5765 data: 0.0001 max mem: 71357 -[07:58:58.923850] Epoch: [1] [2920/6500] lr: 0.000048 closs: 0.7745 (0.7616) grad_norm: 0.3614 (0.4680) time: 5.5731 data: 0.0001 max mem: 71357 -[07:59:54.665057] Epoch: [1] [2930/6500] lr: 0.000048 closs: 0.7605 (0.7615) grad_norm: 0.3855 (0.4680) time: 5.5729 data: 0.0001 max mem: 71357 -[08:00:50.398913] Epoch: [1] [2940/6500] lr: 0.000048 closs: 0.6680 (0.7614) grad_norm: 0.3870 (0.4680) time: 5.5737 data: 0.0001 max mem: 71357 -[08:01:46.151660] Epoch: [1] [2950/6500] lr: 0.000048 closs: 0.6983 (0.7613) grad_norm: 0.4192 (0.4678) time: 5.5743 data: 0.0001 max mem: 71357 -[08:02:41.874089] Epoch: [1] [2960/6500] lr: 0.000047 closs: 0.7362 (0.7612) grad_norm: 0.4303 (0.4679) time: 5.5737 data: 0.0001 max mem: 71357 -[08:03:37.531424] Epoch: [1] [2970/6500] lr: 0.000047 closs: 0.7607 (0.7614) grad_norm: 0.4208 (0.4677) time: 5.5689 data: 0.0001 max mem: 71357 -[08:04:33.308851] Epoch: [1] [2980/6500] lr: 0.000047 closs: 0.8016 (0.7617) grad_norm: 0.4293 (0.4677) time: 5.5716 data: 0.0002 max mem: 71357 -[08:05:29.081119] Epoch: [1] [2990/6500] lr: 0.000047 closs: 0.8433 (0.7620) grad_norm: 0.4644 (0.4678) time: 5.5774 data: 0.0002 max mem: 71357 -[08:06:24.770784] Epoch: [1] [3000/6500] lr: 0.000047 closs: 0.8140 (0.7621) grad_norm: 0.4623 (0.4678) time: 5.5730 data: 0.0001 max mem: 71357 -[08:07:20.456994] Epoch: [1] [3010/6500] lr: 0.000047 closs: 0.7919 (0.7624) grad_norm: 0.4056 (0.4676) time: 5.5687 data: 0.0001 max mem: 71357 -[08:08:16.206729] Epoch: [1] [3020/6500] lr: 0.000047 closs: 0.7767 (0.7623) grad_norm: 0.3780 (0.4676) time: 5.5717 data: 0.0001 max mem: 71357 -[08:09:11.944448] Epoch: [1] [3030/6500] lr: 0.000047 closs: 0.7767 (0.7623) grad_norm: 0.3780 (0.4674) time: 5.5743 data: 0.0002 max mem: 71357 -[08:10:07.833770] Epoch: [1] [3040/6500] lr: 0.000047 closs: 0.7765 (0.7625) grad_norm: 0.3748 (0.4672) time: 5.5812 data: 0.0002 max mem: 71357 -[08:11:03.571764] Epoch: [1] [3050/6500] lr: 0.000047 closs: 0.7737 (0.7626) grad_norm: 0.4118 (0.4674) time: 5.5812 data: 0.0001 max mem: 71357 -[08:11:59.228717] Epoch: [1] [3060/6500] lr: 0.000047 closs: 0.7259 (0.7627) grad_norm: 0.4142 (0.4672) time: 5.5697 data: 0.0001 max mem: 71357 -[08:12:54.997556] Epoch: [1] [3070/6500] lr: 0.000047 closs: 0.7579 (0.7627) grad_norm: 0.4118 (0.4670) time: 5.5712 data: 0.0001 max mem: 71357 -[08:13:50.769938] Epoch: [1] [3080/6500] lr: 0.000047 closs: 0.7579 (0.7629) grad_norm: 0.4118 (0.4668) time: 5.5770 data: 0.0001 max mem: 71357 -[08:14:46.551482] Epoch: [1] [3090/6500] lr: 0.000047 closs: 0.8032 (0.7630) grad_norm: 0.3768 (0.4667) time: 5.5776 data: 0.0001 max mem: 71357 -[08:15:42.253189] Epoch: [1] [3100/6500] lr: 0.000047 closs: 0.7503 (0.7631) grad_norm: 0.3593 (0.4665) time: 5.5740 data: 0.0001 max mem: 71357 -[08:16:37.935608] Epoch: [1] [3110/6500] lr: 0.000047 closs: 0.7480 (0.7632) grad_norm: 0.3968 (0.4666) time: 5.5691 data: 0.0001 max mem: 71357 -[08:17:33.663599] Epoch: [1] [3120/6500] lr: 0.000047 closs: 0.7927 (0.7635) grad_norm: 0.4407 (0.4668) time: 5.5704 data: 0.0001 max mem: 71357 -[08:18:29.345056] Epoch: [1] [3130/6500] lr: 0.000047 closs: 0.6941 (0.7632) grad_norm: 0.3968 (0.4668) time: 5.5704 data: 0.0001 max mem: 71357 -[08:19:24.975538] Epoch: [1] [3140/6500] lr: 0.000047 closs: 0.6547 (0.7629) grad_norm: 0.4024 (0.4667) time: 5.5655 data: 0.0002 max mem: 71357 -[08:20:20.672689] Epoch: [1] [3150/6500] lr: 0.000047 closs: 0.7241 (0.7629) grad_norm: 0.4054 (0.4671) time: 5.5663 data: 0.0002 max mem: 71357 -[08:21:16.505928] Epoch: [1] [3160/6500] lr: 0.000047 closs: 0.7440 (0.7630) grad_norm: 0.3850 (0.4667) time: 5.5764 data: 0.0001 max mem: 71357 -[08:22:12.375786] Epoch: [1] [3170/6500] lr: 0.000047 closs: 0.8047 (0.7632) grad_norm: 0.4024 (0.4664) time: 5.5850 data: 0.0001 max mem: 71357 -[08:23:08.092208] Epoch: [1] [3180/6500] lr: 0.000047 closs: 0.7895 (0.7631) grad_norm: 0.4011 (0.4664) time: 5.5792 data: 0.0001 max mem: 71357 -[08:24:03.827649] Epoch: [1] [3190/6500] lr: 0.000047 closs: 0.7448 (0.7630) grad_norm: 0.4006 (0.4662) time: 5.5725 data: 0.0001 max mem: 71357 -[08:24:59.532202] Epoch: [1] [3200/6500] lr: 0.000047 closs: 0.7496 (0.7631) grad_norm: 0.4011 (0.4661) time: 5.5719 data: 0.0001 max mem: 71357 -[08:25:55.397211] Epoch: [1] [3210/6500] lr: 0.000047 closs: 0.7620 (0.7631) grad_norm: 0.4011 (0.4659) time: 5.5784 data: 0.0001 max mem: 71357 -[08:26:51.117745] Epoch: [1] [3220/6500] lr: 0.000047 closs: 0.7429 (0.7630) grad_norm: 0.4174 (0.4658) time: 5.5791 data: 0.0001 max mem: 71357 -[08:27:46.826662] Epoch: [1] [3230/6500] lr: 0.000047 closs: 0.6742 (0.7628) grad_norm: 0.4086 (0.4658) time: 5.5713 data: 0.0001 max mem: 71357 -[08:28:42.685834] Epoch: [1] [3240/6500] lr: 0.000047 closs: 0.6707 (0.7625) grad_norm: 0.4086 (0.4656) time: 5.5783 data: 0.0001 max mem: 71357 -[08:29:38.511456] Epoch: [1] [3250/6500] lr: 0.000047 closs: 0.6844 (0.7624) grad_norm: 0.3959 (0.4653) time: 5.5842 data: 0.0001 max mem: 71357 -[08:30:34.409341] Epoch: [1] [3260/6500] lr: 0.000047 closs: 0.7046 (0.7623) grad_norm: 0.3587 (0.4650) time: 5.5861 data: 0.0001 max mem: 71357 -[08:31:30.133837] Epoch: [1] [3270/6500] lr: 0.000047 closs: 0.7094 (0.7624) grad_norm: 0.3697 (0.4648) time: 5.5810 data: 0.0001 max mem: 71357 -[08:32:25.764744] Epoch: [1] [3280/6500] lr: 0.000047 closs: 0.8080 (0.7624) grad_norm: 0.3697 (0.4645) time: 5.5677 data: 0.0001 max mem: 71357 -[08:33:21.575850] Epoch: [1] [3290/6500] lr: 0.000047 closs: 0.7767 (0.7625) grad_norm: 0.3703 (0.4643) time: 5.5720 data: 0.0001 max mem: 71357 -[08:34:17.542015] Epoch: [1] [3300/6500] lr: 0.000047 closs: 0.7996 (0.7626) grad_norm: 0.3703 (0.4639) time: 5.5888 data: 0.0001 max mem: 71357 -[08:35:13.335207] Epoch: [1] [3310/6500] lr: 0.000047 closs: 0.8321 (0.7627) grad_norm: 0.3927 (0.4637) time: 5.5879 data: 0.0001 max mem: 71357 -[08:36:09.139010] Epoch: [1] [3320/6500] lr: 0.000047 closs: 0.8484 (0.7629) grad_norm: 0.3927 (0.4638) time: 5.5797 data: 0.0001 max mem: 71357 -[08:37:04.835975] Epoch: [1] [3330/6500] lr: 0.000047 closs: 0.7901 (0.7628) grad_norm: 0.4108 (0.4642) time: 5.5749 data: 0.0001 max mem: 71357 -[08:38:00.607556] Epoch: [1] [3340/6500] lr: 0.000047 closs: 0.7757 (0.7629) grad_norm: 0.4291 (0.4640) time: 5.5733 data: 0.0001 max mem: 71357 -[08:38:56.311132] Epoch: [1] [3350/6500] lr: 0.000047 closs: 0.7757 (0.7630) grad_norm: 0.4346 (0.4640) time: 5.5737 data: 0.0001 max mem: 71357 -[08:39:52.026895] Epoch: [1] [3360/6500] lr: 0.000047 closs: 0.7453 (0.7628) grad_norm: 0.4087 (0.4638) time: 5.5709 data: 0.0001 max mem: 71357 -[08:40:47.717482] Epoch: [1] [3370/6500] lr: 0.000047 closs: 0.7354 (0.7627) grad_norm: 0.3643 (0.4664) time: 5.5702 data: 0.0001 max mem: 71357 -[08:41:43.520242] Epoch: [1] [3380/6500] lr: 0.000047 closs: 0.7594 (0.7628) grad_norm: 0.4010 (0.4665) time: 5.5745 data: 0.0001 max mem: 71357 -[08:42:39.268585] Epoch: [1] [3390/6500] lr: 0.000047 closs: 0.7647 (0.7628) grad_norm: 0.4010 (0.4664) time: 5.5775 data: 0.0001 max mem: 71357 -[08:43:35.001439] Epoch: [1] [3400/6500] lr: 0.000047 closs: 0.7647 (0.7628) grad_norm: 0.3704 (0.4664) time: 5.5740 data: 0.0001 max mem: 71357 -[08:44:30.743501] Epoch: [1] [3410/6500] lr: 0.000047 closs: 0.7684 (0.7630) grad_norm: 0.3704 (0.4662) time: 5.5737 data: 0.0002 max mem: 71357 -[08:45:26.393462] Epoch: [1] [3420/6500] lr: 0.000047 closs: 0.7215 (0.7627) grad_norm: 0.3796 (0.4673) time: 5.5695 data: 0.0002 max mem: 71357 -[08:46:22.210157] Epoch: [1] [3430/6500] lr: 0.000047 closs: 0.6823 (0.7628) grad_norm: 0.4108 (0.4672) time: 5.5732 data: 0.0001 max mem: 71357 -[08:47:17.932445] Epoch: [1] [3440/6500] lr: 0.000047 closs: 0.7888 (0.7629) grad_norm: 0.4108 (0.4673) time: 5.5768 data: 0.0001 max mem: 71357 -[08:48:13.684592] Epoch: [1] [3450/6500] lr: 0.000047 closs: 0.7483 (0.7628) grad_norm: 0.4411 (0.4673) time: 5.5736 data: 0.0001 max mem: 71357 -[08:49:09.395292] Epoch: [1] [3460/6500] lr: 0.000047 closs: 0.7008 (0.7626) grad_norm: 0.4411 (0.4672) time: 5.5731 data: 0.0001 max mem: 71357 -[08:50:05.184879] Epoch: [1] [3470/6500] lr: 0.000047 closs: 0.6853 (0.7625) grad_norm: 0.4585 (0.4672) time: 5.5749 data: 0.0001 max mem: 71357 -[08:51:01.025092] Epoch: [1] [3480/6500] lr: 0.000047 closs: 0.7112 (0.7624) grad_norm: 0.4481 (0.4671) time: 5.5814 data: 0.0001 max mem: 71357 -[08:51:56.784945] Epoch: [1] [3490/6500] lr: 0.000047 closs: 0.7441 (0.7625) grad_norm: 0.4319 (0.4668) time: 5.5799 data: 0.0001 max mem: 71357 -[08:52:52.516000] Epoch: [1] [3500/6500] lr: 0.000047 closs: 0.7127 (0.7623) grad_norm: 0.4192 (0.4667) time: 5.5744 data: 0.0001 max mem: 71357 -[08:53:48.323634] Epoch: [1] [3510/6500] lr: 0.000046 closs: 0.6960 (0.7622) grad_norm: 0.3713 (0.4664) time: 5.5768 data: 0.0001 max mem: 71357 -[08:54:44.172207] Epoch: [1] [3520/6500] lr: 0.000046 closs: 0.7526 (0.7620) grad_norm: 0.3674 (0.4662) time: 5.5827 data: 0.0001 max mem: 71357 -[08:55:39.982733] Epoch: [1] [3530/6500] lr: 0.000046 closs: 0.7364 (0.7620) grad_norm: 0.3693 (0.4663) time: 5.5829 data: 0.0001 max mem: 71357 -[08:56:35.769447] Epoch: [1] [3540/6500] lr: 0.000046 closs: 0.7020 (0.7619) grad_norm: 0.3710 (0.4665) time: 5.5798 data: 0.0001 max mem: 71357 -[08:57:31.531558] Epoch: [1] [3550/6500] lr: 0.000046 closs: 0.7080 (0.7618) grad_norm: 0.3710 (0.4661) time: 5.5773 data: 0.0001 max mem: 71357 -[08:58:27.330798] Epoch: [1] [3560/6500] lr: 0.000046 closs: 0.7660 (0.7619) grad_norm: 0.4092 (0.4664) time: 5.5780 data: 0.0001 max mem: 71357 -[08:59:23.008683] Epoch: [1] [3570/6500] lr: 0.000046 closs: 0.7951 (0.7619) grad_norm: 0.4092 (0.4664) time: 5.5738 data: 0.0002 max mem: 71357 -[09:00:18.802643] Epoch: [1] [3580/6500] lr: 0.000046 closs: 0.8291 (0.7622) grad_norm: 0.4014 (0.4662) time: 5.5735 data: 0.0002 max mem: 71357 -[09:01:14.550611] Epoch: [1] [3590/6500] lr: 0.000046 closs: 0.8291 (0.7623) grad_norm: 0.4089 (0.4660) time: 5.5770 data: 0.0001 max mem: 71357 -[09:02:10.271773] Epoch: [1] [3600/6500] lr: 0.000046 closs: 0.7837 (0.7624) grad_norm: 0.4011 (0.4662) time: 5.5734 data: 0.0001 max mem: 71357 -[09:03:06.083973] Epoch: [1] [3610/6500] lr: 0.000046 closs: 0.7482 (0.7622) grad_norm: 0.3935 (0.4660) time: 5.5765 data: 0.0001 max mem: 71357 -[09:04:01.836078] Epoch: [1] [3620/6500] lr: 0.000046 closs: 0.6927 (0.7623) grad_norm: 0.3726 (0.4658) time: 5.5781 data: 0.0001 max mem: 71357 -[09:04:57.478678] Epoch: [1] [3630/6500] lr: 0.000046 closs: 0.6874 (0.7621) grad_norm: 0.3935 (0.4657) time: 5.5697 data: 0.0001 max mem: 71357 -[09:05:53.278864] Epoch: [1] [3640/6500] lr: 0.000046 closs: 0.7235 (0.7621) grad_norm: 0.3875 (0.4655) time: 5.5721 data: 0.0001 max mem: 71357 -[09:06:48.995421] Epoch: [1] [3650/6500] lr: 0.000046 closs: 0.7066 (0.7619) grad_norm: 0.3875 (0.4654) time: 5.5757 data: 0.0001 max mem: 71357 -[09:07:44.777569] Epoch: [1] [3660/6500] lr: 0.000046 closs: 0.6981 (0.7619) grad_norm: 0.3875 (0.4651) time: 5.5748 data: 0.0001 max mem: 71357 -[09:08:40.434566] Epoch: [1] [3670/6500] lr: 0.000046 closs: 0.7989 (0.7620) grad_norm: 0.3847 (0.4649) time: 5.5718 data: 0.0001 max mem: 71357 -[09:09:36.095219] Epoch: [1] [3680/6500] lr: 0.000046 closs: 0.7827 (0.7620) grad_norm: 0.3847 (0.4649) time: 5.5658 data: 0.0002 max mem: 71357 -[09:10:31.882262] Epoch: [1] [3690/6500] lr: 0.000046 closs: 0.7264 (0.7618) grad_norm: 0.3847 (0.4648) time: 5.5723 data: 0.0002 max mem: 71357 -[09:11:27.755027] Epoch: [1] [3700/6500] lr: 0.000046 closs: 0.7427 (0.7620) grad_norm: 0.3847 (0.4646) time: 5.5829 data: 0.0001 max mem: 71357 -[09:12:23.464314] Epoch: [1] [3710/6500] lr: 0.000046 closs: 0.7427 (0.7619) grad_norm: 0.3666 (0.4645) time: 5.5790 data: 0.0001 max mem: 71357 -[09:13:19.156115] Epoch: [1] [3720/6500] lr: 0.000046 closs: 0.6591 (0.7618) grad_norm: 0.3587 (0.4642) time: 5.5700 data: 0.0001 max mem: 71357 -[09:14:14.876345] Epoch: [1] [3730/6500] lr: 0.000046 closs: 0.6957 (0.7617) grad_norm: 0.3575 (0.4639) time: 5.5705 data: 0.0001 max mem: 71357 -[09:15:10.762487] Epoch: [1] [3740/6500] lr: 0.000046 closs: 0.7069 (0.7615) grad_norm: 0.3504 (0.4636) time: 5.5802 data: 0.0001 max mem: 71357 -[09:16:06.591376] Epoch: [1] [3750/6500] lr: 0.000046 closs: 0.7085 (0.7616) grad_norm: 0.3295 (0.4632) time: 5.5856 data: 0.0001 max mem: 71357 -[09:17:02.377046] Epoch: [1] [3760/6500] lr: 0.000046 closs: 0.8005 (0.7617) grad_norm: 0.3338 (0.4631) time: 5.5806 data: 0.0001 max mem: 71357 -[09:17:58.234451] Epoch: [1] [3770/6500] lr: 0.000046 closs: 0.7918 (0.7618) grad_norm: 0.3461 (0.4631) time: 5.5821 data: 0.0001 max mem: 71357 -[09:18:54.023163] Epoch: [1] [3780/6500] lr: 0.000046 closs: 0.7503 (0.7619) grad_norm: 0.3461 (0.4628) time: 5.5822 data: 0.0001 max mem: 71357 -[09:19:49.736021] Epoch: [1] [3790/6500] lr: 0.000046 closs: 0.7219 (0.7617) grad_norm: 0.3838 (0.4628) time: 5.5750 data: 0.0002 max mem: 71357 -[09:20:45.437282] Epoch: [1] [3800/6500] lr: 0.000046 closs: 0.7728 (0.7621) grad_norm: 0.3838 (0.4625) time: 5.5706 data: 0.0002 max mem: 71357 -[09:21:41.168373] Epoch: [1] [3810/6500] lr: 0.000046 closs: 0.8062 (0.7621) grad_norm: 0.3776 (0.4627) time: 5.5715 data: 0.0001 max mem: 71357 -[09:22:36.975415] Epoch: [1] [3820/6500] lr: 0.000046 closs: 0.7432 (0.7622) grad_norm: 0.3782 (0.4624) time: 5.5768 data: 0.0001 max mem: 71357 -[09:23:32.914290] Epoch: [1] [3830/6500] lr: 0.000046 closs: 0.7504 (0.7620) grad_norm: 0.3533 (0.4622) time: 5.5872 data: 0.0001 max mem: 71357 -[09:24:28.594812] Epoch: [1] [3840/6500] lr: 0.000046 closs: 0.7185 (0.7618) grad_norm: 0.3782 (0.4622) time: 5.5809 data: 0.0002 max mem: 71357 -[09:25:24.364082] Epoch: [1] [3850/6500] lr: 0.000046 closs: 0.7185 (0.7618) grad_norm: 0.3533 (0.4620) time: 5.5724 data: 0.0002 max mem: 71357 -[09:26:20.100663] Epoch: [1] [3860/6500] lr: 0.000046 closs: 0.7267 (0.7616) grad_norm: 0.3744 (0.4618) time: 5.5752 data: 0.0001 max mem: 71357 -[09:27:15.928339] Epoch: [1] [3870/6500] lr: 0.000046 closs: 0.7195 (0.7616) grad_norm: 0.3995 (0.4619) time: 5.5781 data: 0.0001 max mem: 71357 -[09:28:11.670865] Epoch: [1] [3880/6500] lr: 0.000046 closs: 0.7310 (0.7615) grad_norm: 0.3729 (0.4618) time: 5.5784 data: 0.0001 max mem: 71357 -[09:29:07.413787] Epoch: [1] [3890/6500] lr: 0.000046 closs: 0.7109 (0.7613) grad_norm: 0.3729 (0.4616) time: 5.5742 data: 0.0001 max mem: 71357 -[09:30:03.241491] Epoch: [1] [3900/6500] lr: 0.000046 closs: 0.7159 (0.7613) grad_norm: 0.3739 (0.4617) time: 5.5784 data: 0.0001 max mem: 71357 -[09:30:59.019159] Epoch: [1] [3910/6500] lr: 0.000046 closs: 0.7093 (0.7612) grad_norm: 0.3819 (0.4618) time: 5.5802 data: 0.0001 max mem: 71357 -[09:31:54.797486] Epoch: [1] [3920/6500] lr: 0.000046 closs: 0.6981 (0.7612) grad_norm: 0.4453 (0.4617) time: 5.5777 data: 0.0001 max mem: 71357 -[09:32:50.515822] Epoch: [1] [3930/6500] lr: 0.000046 closs: 0.7891 (0.7612) grad_norm: 0.4498 (0.4617) time: 5.5747 data: 0.0001 max mem: 71357 -[09:33:46.285993] Epoch: [1] [3940/6500] lr: 0.000046 closs: 0.8399 (0.7613) grad_norm: 0.4643 (0.4619) time: 5.5743 data: 0.0001 max mem: 71357 -[09:34:42.060196] Epoch: [1] [3950/6500] lr: 0.000046 closs: 0.7596 (0.7614) grad_norm: 0.4053 (0.4616) time: 5.5771 data: 0.0002 max mem: 71357 -[09:35:37.892560] Epoch: [1] [3960/6500] lr: 0.000046 closs: 0.7154 (0.7614) grad_norm: 0.4053 (0.4618) time: 5.5802 data: 0.0002 max mem: 71357 -[09:36:33.555038] Epoch: [1] [3970/6500] lr: 0.000046 closs: 0.7729 (0.7614) grad_norm: 0.3912 (0.4616) time: 5.5746 data: 0.0001 max mem: 71357 -[09:37:29.253407] Epoch: [1] [3980/6500] lr: 0.000046 closs: 0.7599 (0.7612) grad_norm: 0.3912 (0.4615) time: 5.5679 data: 0.0001 max mem: 71357 -[09:38:25.028340] Epoch: [1] [3990/6500] lr: 0.000046 closs: 0.7088 (0.7613) grad_norm: 0.3892 (0.4614) time: 5.5735 data: 0.0001 max mem: 71357 -[09:39:20.898508] Epoch: [1] [4000/6500] lr: 0.000045 closs: 0.7781 (0.7614) grad_norm: 0.3892 (0.4614) time: 5.5821 data: 0.0001 max mem: 71357 -[09:40:16.589778] Epoch: [1] [4010/6500] lr: 0.000045 closs: 0.7052 (0.7612) grad_norm: 0.3709 (0.4611) time: 5.5780 data: 0.0001 max mem: 71357 -[09:41:12.368366] Epoch: [1] [4020/6500] lr: 0.000045 closs: 0.7052 (0.7612) grad_norm: 0.3709 (0.4611) time: 5.5734 data: 0.0001 max mem: 71357 -[09:42:08.075783] Epoch: [1] [4030/6500] lr: 0.000045 closs: 0.7410 (0.7613) grad_norm: 0.3698 (0.4610) time: 5.5742 data: 0.0001 max mem: 71357 -[09:43:03.790892] Epoch: [1] [4040/6500] lr: 0.000045 closs: 0.7635 (0.7613) grad_norm: 0.3481 (0.4610) time: 5.5711 data: 0.0001 max mem: 71357 -[09:43:59.611304] Epoch: [1] [4050/6500] lr: 0.000045 closs: 0.7848 (0.7613) grad_norm: 0.3679 (0.4608) time: 5.5767 data: 0.0001 max mem: 71357 -[09:44:55.346989] Epoch: [1] [4060/6500] lr: 0.000045 closs: 0.8193 (0.7616) grad_norm: 0.3679 (0.4607) time: 5.5777 data: 0.0002 max mem: 71357 -[09:45:51.144585] Epoch: [1] [4070/6500] lr: 0.000045 closs: 0.8203 (0.7615) grad_norm: 0.3481 (0.4605) time: 5.5765 data: 0.0002 max mem: 71357 -[09:46:46.827827] Epoch: [1] [4080/6500] lr: 0.000045 closs: 0.7796 (0.7615) grad_norm: 0.3621 (0.4605) time: 5.5739 data: 0.0001 max mem: 71357 -[09:47:42.580540] Epoch: [1] [4090/6500] lr: 0.000045 closs: 0.7967 (0.7616) grad_norm: 0.3748 (0.4606) time: 5.5716 data: 0.0001 max mem: 71357 -[09:48:38.297486] Epoch: [1] [4100/6500] lr: 0.000045 closs: 0.7621 (0.7619) grad_norm: 0.4406 (0.4609) time: 5.5733 data: 0.0001 max mem: 71357 -[09:49:33.991922] Epoch: [1] [4110/6500] lr: 0.000045 closs: 0.7621 (0.7618) grad_norm: 0.5227 (0.4609) time: 5.5705 data: 0.0002 max mem: 71357 -[09:50:29.739373] Epoch: [1] [4120/6500] lr: 0.000045 closs: 0.6884 (0.7617) grad_norm: 0.4406 (0.4607) time: 5.5720 data: 0.0002 max mem: 71357 -[09:51:25.522880] Epoch: [1] [4130/6500] lr: 0.000045 closs: 0.7258 (0.7617) grad_norm: 0.4095 (0.4607) time: 5.5764 data: 0.0001 max mem: 71357 -[09:52:21.422843] Epoch: [1] [4140/6500] lr: 0.000045 closs: 0.7362 (0.7617) grad_norm: 0.3938 (0.4605) time: 5.5841 data: 0.0001 max mem: 71357 -[09:53:17.172143] Epoch: [1] [4150/6500] lr: 0.000045 closs: 0.7308 (0.7617) grad_norm: 0.3742 (0.4604) time: 5.5824 data: 0.0001 max mem: 71357 -[09:54:12.914122] Epoch: [1] [4160/6500] lr: 0.000045 closs: 0.7515 (0.7616) grad_norm: 0.3742 (0.4603) time: 5.5744 data: 0.0001 max mem: 71357 -[09:55:08.625285] Epoch: [1] [4170/6500] lr: 0.000045 closs: 0.7577 (0.7616) grad_norm: 0.3742 (0.4602) time: 5.5725 data: 0.0001 max mem: 71357 -[09:56:04.404235] Epoch: [1] [4180/6500] lr: 0.000045 closs: 0.7953 (0.7617) grad_norm: 0.3742 (0.4600) time: 5.5744 data: 0.0001 max mem: 71357 -[09:57:00.149637] Epoch: [1] [4190/6500] lr: 0.000045 closs: 0.7425 (0.7615) grad_norm: 0.3742 (0.4598) time: 5.5761 data: 0.0001 max mem: 71357 -[09:57:55.891705] Epoch: [1] [4200/6500] lr: 0.000045 closs: 0.7307 (0.7614) grad_norm: 0.3675 (0.4596) time: 5.5743 data: 0.0001 max mem: 71357 -[09:58:51.703348] Epoch: [1] [4210/6500] lr: 0.000045 closs: 0.7554 (0.7616) grad_norm: 0.3542 (0.4596) time: 5.5776 data: 0.0001 max mem: 71357 -[09:59:47.523751] Epoch: [1] [4220/6500] lr: 0.000045 closs: 0.7977 (0.7617) grad_norm: 0.3714 (0.4594) time: 5.5815 data: 0.0001 max mem: 71357 -[10:00:43.137897] Epoch: [1] [4230/6500] lr: 0.000045 closs: 0.7750 (0.7617) grad_norm: 0.3772 (0.4597) time: 5.5716 data: 0.0001 max mem: 71357 -[10:01:38.861888] Epoch: [1] [4240/6500] lr: 0.000045 closs: 0.7175 (0.7616) grad_norm: 0.3714 (0.4594) time: 5.5668 data: 0.0001 max mem: 71357 -[10:02:34.575388] Epoch: [1] [4250/6500] lr: 0.000045 closs: 0.7552 (0.7615) grad_norm: 0.3486 (0.4591) time: 5.5718 data: 0.0001 max mem: 71357 -[10:03:30.284625] Epoch: [1] [4260/6500] lr: 0.000045 closs: 0.8085 (0.7617) grad_norm: 0.3462 (0.4591) time: 5.5710 data: 0.0001 max mem: 71357 -[10:04:26.115209] Epoch: [1] [4270/6500] lr: 0.000045 closs: 0.7622 (0.7617) grad_norm: 0.3462 (0.4591) time: 5.5769 data: 0.0001 max mem: 71357 -[10:05:21.870075] Epoch: [1] [4280/6500] lr: 0.000045 closs: 0.7565 (0.7615) grad_norm: 0.3631 (0.4589) time: 5.5792 data: 0.0001 max mem: 71357 -[10:06:17.586494] Epoch: [1] [4290/6500] lr: 0.000045 closs: 0.7211 (0.7615) grad_norm: 0.3673 (0.4587) time: 5.5734 data: 0.0001 max mem: 71357 -[10:07:13.372539] Epoch: [1] [4300/6500] lr: 0.000045 closs: 0.7198 (0.7613) grad_norm: 0.3553 (0.4586) time: 5.5750 data: 0.0001 max mem: 71357 -[10:08:09.184606] Epoch: [1] [4310/6500] lr: 0.000045 closs: 0.7264 (0.7612) grad_norm: 0.3453 (0.4589) time: 5.5798 data: 0.0001 max mem: 71357 -[10:09:04.953272] Epoch: [1] [4320/6500] lr: 0.000045 closs: 0.7640 (0.7613) grad_norm: 0.3516 (0.4588) time: 5.5789 data: 0.0001 max mem: 71357 -[10:10:00.689997] Epoch: [1] [4330/6500] lr: 0.000045 closs: 0.7334 (0.7610) grad_norm: 0.3783 (0.4586) time: 5.5752 data: 0.0001 max mem: 71357 -[10:10:56.358374] Epoch: [1] [4340/6500] lr: 0.000045 closs: 0.7335 (0.7611) grad_norm: 0.4012 (0.4585) time: 5.5702 data: 0.0001 max mem: 71357 -[10:11:52.042943] Epoch: [1] [4350/6500] lr: 0.000045 closs: 0.7335 (0.7611) grad_norm: 0.4101 (0.4585) time: 5.5675 data: 0.0001 max mem: 71357 -[10:12:47.814019] Epoch: [1] [4360/6500] lr: 0.000045 closs: 0.7259 (0.7610) grad_norm: 0.3995 (0.4584) time: 5.5727 data: 0.0001 max mem: 71357 -[10:13:43.550859] Epoch: [1] [4370/6500] lr: 0.000045 closs: 0.7493 (0.7610) grad_norm: 0.3995 (0.4584) time: 5.5753 data: 0.0001 max mem: 71357 -[10:14:39.253856] Epoch: [1] [4380/6500] lr: 0.000045 closs: 0.7821 (0.7611) grad_norm: 0.4144 (0.4587) time: 5.5719 data: 0.0001 max mem: 71357 -[10:15:35.182285] Epoch: [1] [4390/6500] lr: 0.000045 closs: 0.7537 (0.7612) grad_norm: 0.3995 (0.4589) time: 5.5815 data: 0.0001 max mem: 71357 -[10:16:30.985009] Epoch: [1] [4400/6500] lr: 0.000045 closs: 0.7537 (0.7611) grad_norm: 0.3660 (0.4588) time: 5.5864 data: 0.0001 max mem: 71357 -[10:17:26.686911] Epoch: [1] [4410/6500] lr: 0.000045 closs: 0.7020 (0.7611) grad_norm: 0.3723 (0.4587) time: 5.5751 data: 0.0001 max mem: 71357 -[10:18:22.372236] Epoch: [1] [4420/6500] lr: 0.000045 closs: 0.7378 (0.7611) grad_norm: 0.3589 (0.4585) time: 5.5692 data: 0.0001 max mem: 71357 -[10:19:18.157527] Epoch: [1] [4430/6500] lr: 0.000045 closs: 0.7378 (0.7611) grad_norm: 0.3699 (0.4583) time: 5.5734 data: 0.0001 max mem: 71357 -[10:20:14.052884] Epoch: [1] [4440/6500] lr: 0.000044 closs: 0.6983 (0.7610) grad_norm: 0.3723 (0.4585) time: 5.5839 data: 0.0002 max mem: 71357 -WARNING:torch.distributed.run: -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -| distributed init (rank 1): env://, gpu 1 -| distributed init (rank 0): env://, gpu 0 -[10:21:09.403398] > initializing model parallel with size 1 -[10:21:09.403479] > initializing ddp with size 2 -[10:21:09.403488] > initializing pipeline with size 1 -[10:21:09.450720] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory -[10:21:09.450818] Namespace(batch_size=4, -accum_iter=2, -llama_type='llama_peft', -llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'], -no_visual=True, -tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model', -pretrained_path='../checkpoints/llama2/Llama-2-70b/', -pretrained_type='meta_ori', -weight_decay=0.02, -lr=5e-05, -min_lr=5e-06, -epochs=4, -warmup_epochs=1.0, -clip_grad=2, -max_words=512, -dialog=False, -data_config='configs/data/finetune/sg/alpaca.yaml', -output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B', -log_dir='./output_dir', -save_interval=1, -device='cuda', -seed=0, -resume='', -num_workers=8, -pin_mem=True, -world_size=2, -local_rank=-1, -dist_on_itp=False, -dist_url='env://', -model_parallel_size=1, -data_parallel='sdp', -precision='bf16', -checkpointing=True, -quant=True, -rank=0, -gpu=0, -distributed=True, -dist_backend='nccl') -[10:21:09.451703] Start initialization. -[10:21:09.466825] Model Args: - ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) -[10:21:09.915438] Epoch: [1] [4450/6500] lr: 0.000044 closs: 0.6983 (0.7610) grad_norm: 0.3699 (0.4583) time: 5.5878 data: 0.0002 max mem: 71357 -[10:22:05.668596] Epoch: [1] [4460/6500] lr: 0.000044 closs: 0.7467 (0.7610) grad_norm: 0.3758 (0.4583) time: 5.5807 data: 0.0002 max mem: 71357 -[10:23:01.470643] Epoch: [1] [4470/6500] lr: 0.000044 closs: 0.7482 (0.7610) grad_norm: 0.3644 (0.4581) time: 5.5776 data: 0.0002 max mem: 71357 -[10:23:57.292617] Epoch: [1] [4480/6500] lr: 0.000044 closs: 0.7199 (0.7609) grad_norm: 0.3613 (0.4579) time: 5.5811 data: 0.0002 max mem: 71357 -[10:24:53.185106] Epoch: [1] [4490/6500] lr: 0.000044 closs: 0.8161 (0.7611) grad_norm: 0.3613 (0.4580) time: 5.5856 data: 0.0001 max mem: 71357 -[10:25:48.906594] Epoch: [1] [4500/6500] lr: 0.000044 closs: 0.8007 (0.7612) grad_norm: 0.3793 (0.4581) time: 5.5806 data: 0.0002 max mem: 71357 -[10:26:44.655847] Epoch: [1] [4510/6500] lr: 0.000044 closs: 0.7206 (0.7613) grad_norm: 0.3905 (0.4579) time: 5.5735 data: 0.0002 max mem: 71357 -[10:27:40.351074] Epoch: [1] [4520/6500] lr: 0.000044 closs: 0.7976 (0.7615) grad_norm: 0.4285 (0.4580) time: 5.5721 data: 0.0002 max mem: 71357 -[10:28:36.191258] Epoch: [1] [4530/6500] lr: 0.000044 closs: 0.7976 (0.7615) grad_norm: 0.4008 (0.4579) time: 5.5767 data: 0.0001 max mem: 71357 -[10:29:31.968038] Epoch: [1] [4540/6500] lr: 0.000044 closs: 0.7567 (0.7614) grad_norm: 0.3905 (0.4577) time: 5.5807 data: 0.0001 max mem: 71357 -[10:30:27.763567] Epoch: [1] [4550/6500] lr: 0.000044 closs: 0.7294 (0.7615) grad_norm: 0.3837 (0.4576) time: 5.5785 data: 0.0001 max mem: 71357 -[10:31:23.432711] Epoch: [1] [4560/6500] lr: 0.000044 closs: 0.7294 (0.7615) grad_norm: 0.3770 (0.4577) time: 5.5731 data: 0.0001 max mem: 71357 -[10:32:19.140978] Epoch: [1] [4570/6500] lr: 0.000044 closs: 0.7468 (0.7616) grad_norm: 0.3770 (0.4577) time: 5.5688 data: 0.0001 max mem: 71357 -[10:33:14.948775] Epoch: [1] [4580/6500] lr: 0.000044 closs: 0.7468 (0.7616) grad_norm: 0.3815 (0.4577) time: 5.5757 data: 0.0001 max mem: 71357 -[10:34:10.772732] Epoch: [1] [4590/6500] lr: 0.000044 closs: 0.7741 (0.7616) grad_norm: 0.3815 (0.4577) time: 5.5815 data: 0.0001 max mem: 71357 -[10:35:06.473382] Epoch: [1] [4600/6500] lr: 0.000044 closs: 0.7606 (0.7615) grad_norm: 0.3815 (0.4576) time: 5.5761 data: 0.0002 max mem: 71357 -[10:36:02.235458] Epoch: [1] [4610/6500] lr: 0.000044 closs: 0.6930 (0.7615) grad_norm: 0.3815 (0.4577) time: 5.5730 data: 0.0002 max mem: 71357 -[10:36:58.036756] Epoch: [1] [4620/6500] lr: 0.000044 closs: 0.7670 (0.7616) grad_norm: 0.4095 (0.4576) time: 5.5780 data: 0.0001 max mem: 71357 -[10:37:53.792580] Epoch: [1] [4630/6500] lr: 0.000044 closs: 0.7398 (0.7616) grad_norm: 0.4037 (0.4574) time: 5.5778 data: 0.0001 max mem: 71357 -[10:38:49.606204] Epoch: [1] [4640/6500] lr: 0.000044 closs: 0.7056 (0.7615) grad_norm: 0.3881 (0.4572) time: 5.5784 data: 0.0001 max mem: 71357 -[10:39:45.383787] Epoch: [1] [4650/6500] lr: 0.000044 closs: 0.7542 (0.7616) grad_norm: 0.3881 (0.4574) time: 5.5794 data: 0.0002 max mem: 71357 -[10:40:41.177173] Epoch: [1] [4660/6500] lr: 0.000044 closs: 0.7175 (0.7616) grad_norm: 0.3625 (0.4574) time: 5.5785 data: 0.0001 max mem: 71357 -[10:41:36.903399] Epoch: [1] [4670/6500] lr: 0.000044 closs: 0.6899 (0.7614) grad_norm: 0.3625 (0.4574) time: 5.5759 data: 0.0001 max mem: 71357 -[10:42:32.710916] Epoch: [1] [4680/6500] lr: 0.000044 closs: 0.7280 (0.7615) grad_norm: 0.3585 (0.4571) time: 5.5765 data: 0.0001 max mem: 71357 -[10:43:28.526448] Epoch: [1] [4690/6500] lr: 0.000044 closs: 0.7319 (0.7615) grad_norm: 0.3625 (0.4570) time: 5.5810 data: 0.0001 max mem: 71357 -[10:44:24.274780] Epoch: [1] [4700/6500] lr: 0.000044 closs: 0.7385 (0.7614) grad_norm: 0.3723 (0.4571) time: 5.5781 data: 0.0001 max mem: 71357 -[10:45:20.118274] Epoch: [1] [4710/6500] lr: 0.000044 closs: 0.7702 (0.7616) grad_norm: 0.3723 (0.4570) time: 5.5795 data: 0.0002 max mem: 71357 -[10:46:15.876910] Epoch: [1] [4720/6500] lr: 0.000044 closs: 0.8632 (0.7617) grad_norm: 0.3725 (0.4569) time: 5.5800 data: 0.0002 max mem: 71357 -[10:47:11.670606] Epoch: [1] [4730/6500] lr: 0.000044 closs: 0.7829 (0.7617) grad_norm: 0.3538 (0.4566) time: 5.5775 data: 0.0001 max mem: 71357 -[10:48:07.502727] Epoch: [1] [4740/6500] lr: 0.000044 closs: 0.8662 (0.7620) grad_norm: 0.3451 (0.4564) time: 5.5812 data: 0.0001 max mem: 71357 -[10:49:03.350525] Epoch: [1] [4750/6500] lr: 0.000044 closs: 0.8316 (0.7620) grad_norm: 0.3455 (0.4564) time: 5.5839 data: 0.0001 max mem: 71357 -[10:49:59.088820] Epoch: [1] [4760/6500] lr: 0.000044 closs: 0.7709 (0.7620) grad_norm: 0.3764 (0.4564) time: 5.5792 data: 0.0002 max mem: 71357 -[10:50:54.918656] Epoch: [1] [4770/6500] lr: 0.000044 closs: 0.7709 (0.7620) grad_norm: 0.4139 (0.4562) time: 5.5783 data: 0.0002 max mem: 71357 -[10:51:50.661923] Epoch: [1] [4780/6500] lr: 0.000044 closs: 0.7886 (0.7621) grad_norm: 0.4284 (0.4565) time: 5.5786 data: 0.0001 max mem: 71357 -[10:52:46.468233] Epoch: [1] [4790/6500] lr: 0.000044 closs: 0.8177 (0.7621) grad_norm: 0.4017 (0.4562) time: 5.5774 data: 0.0001 max mem: 71357 -[10:53:42.298368] Epoch: [1] [4800/6500] lr: 0.000044 closs: 0.8177 (0.7623) grad_norm: 0.3854 (0.4563) time: 5.5817 data: 0.0001 max mem: 71357 -[10:54:37.963093] Epoch: [1] [4810/6500] lr: 0.000044 closs: 0.7960 (0.7625) grad_norm: 0.3854 (0.4572) time: 5.5746 data: 0.0001 max mem: 71357 -[10:55:33.724578] Epoch: [1] [4820/6500] lr: 0.000044 closs: 0.7233 (0.7623) grad_norm: 0.3701 (0.4571) time: 5.5712 data: 0.0001 max mem: 71357 -[10:56:29.502489] Epoch: [1] [4830/6500] lr: 0.000044 closs: 0.7313 (0.7624) grad_norm: 0.3720 (0.4579) time: 5.5769 data: 0.0001 max mem: 71357 -[10:57:25.285468] Epoch: [1] [4840/6500] lr: 0.000043 closs: 0.7270 (0.7623) grad_norm: 0.3717 (0.4578) time: 5.5780 data: 0.0001 max mem: 71357 -[10:58:20.999566] Epoch: [1] [4850/6500] lr: 0.000043 closs: 0.7105 (0.7623) grad_norm: 0.4146 (0.4578) time: 5.5747 data: 0.0001 max mem: 71357 -[10:59:16.743808] Epoch: [1] [4860/6500] lr: 0.000043 closs: 0.7434 (0.7624) grad_norm: 0.4005 (0.4577) time: 5.5728 data: 0.0001 max mem: 71357 -[11:00:12.471000] Epoch: [1] [4870/6500] lr: 0.000043 closs: 0.8188 (0.7625) grad_norm: 0.4129 (0.4576) time: 5.5734 data: 0.0002 max mem: 71357 -[11:01:08.334659] Epoch: [1] [4880/6500] lr: 0.000043 closs: 0.7880 (0.7625) grad_norm: 0.3817 (0.4573) time: 5.5794 data: 0.0002 max mem: 71357 -[11:02:04.159214] Epoch: [1] [4890/6500] lr: 0.000043 closs: 0.7651 (0.7626) grad_norm: 0.3529 (0.4571) time: 5.5843 data: 0.0001 max mem: 71357 -[11:02:59.872216] Epoch: [1] [4900/6500] lr: 0.000043 closs: 0.7651 (0.7626) grad_norm: 0.3376 (0.4570) time: 5.5768 data: 0.0001 max mem: 71357 -[11:03:55.655000] Epoch: [1] [4910/6500] lr: 0.000043 closs: 0.7754 (0.7626) grad_norm: 0.3316 (0.4568) time: 5.5747 data: 0.0001 max mem: 71357 -[11:04:51.487749] Epoch: [1] [4920/6500] lr: 0.000043 closs: 0.7336 (0.7625) grad_norm: 0.3376 (0.4566) time: 5.5806 data: 0.0002 max mem: 71357 -[11:05:47.384044] Epoch: [1] [4930/6500] lr: 0.000043 closs: 0.7084 (0.7624) grad_norm: 0.3480 (0.4565) time: 5.5863 data: 0.0002 max mem: 71357 -[11:06:43.177011] Epoch: [1] [4940/6500] lr: 0.000043 closs: 0.7459 (0.7623) grad_norm: 0.3427 (0.4563) time: 5.5843 data: 0.0001 max mem: 71357 -[11:07:38.843228] Epoch: [1] [4950/6500] lr: 0.000043 closs: 0.7459 (0.7623) grad_norm: 0.3660 (0.4564) time: 5.5728 data: 0.0001 max mem: 71357 -[11:08:34.583423] Epoch: [1] [4960/6500] lr: 0.000043 closs: 0.7790 (0.7624) grad_norm: 0.4092 (0.4563) time: 5.5702 data: 0.0001 max mem: 71357 -[11:09:30.439676] Epoch: [1] [4970/6500] lr: 0.000043 closs: 0.7991 (0.7624) grad_norm: 0.4092 (0.4561) time: 5.5797 data: 0.0001 max mem: 71357 -[11:10:26.188771] Epoch: [1] [4980/6500] lr: 0.000043 closs: 0.7671 (0.7624) grad_norm: 0.3963 (0.4561) time: 5.5801 data: 0.0002 max mem: 71357 -[11:11:21.946425] Epoch: [1] [4990/6500] lr: 0.000043 closs: 0.7342 (0.7623) grad_norm: 0.3800 (0.4560) time: 5.5752 data: 0.0002 max mem: 71357 -[11:12:17.744689] Epoch: [1] [5000/6500] lr: 0.000043 closs: 0.7342 (0.7622) grad_norm: 0.3800 (0.4565) time: 5.5777 data: 0.0001 max mem: 71357 -[11:13:13.585475] Epoch: [1] [5010/6500] lr: 0.000043 closs: 0.7470 (0.7621) grad_norm: 0.3830 (0.4563) time: 5.5818 data: 0.0001 max mem: 71357 -[11:14:09.395279] Epoch: [1] [5020/6500] lr: 0.000043 closs: 0.7470 (0.7621) grad_norm: 0.3689 (0.4561) time: 5.5824 data: 0.0001 max mem: 71357 -[11:15:05.164880] Epoch: [1] [5030/6500] lr: 0.000043 closs: 0.7894 (0.7623) grad_norm: 0.3559 (0.4563) time: 5.5788 data: 0.0002 max mem: 71357 -[11:16:00.882700] Epoch: [1] [5040/6500] lr: 0.000043 closs: 0.7406 (0.7621) grad_norm: 0.3560 (0.4567) time: 5.5742 data: 0.0002 max mem: 71357 -[11:16:56.643127] Epoch: [1] [5050/6500] lr: 0.000043 closs: 0.7206 (0.7621) grad_norm: 0.3608 (0.4565) time: 5.5738 data: 0.0001 max mem: 71357 -[11:17:52.504961] Epoch: [1] [5060/6500] lr: 0.000043 closs: 0.7532 (0.7621) grad_norm: 0.3612 (0.4564) time: 5.5810 data: 0.0001 max mem: 71357 -[11:18:48.234806] Epoch: [1] [5070/6500] lr: 0.000043 closs: 0.7623 (0.7620) grad_norm: 0.3612 (0.4564) time: 5.5795 data: 0.0001 max mem: 71357 -[11:19:44.087526] Epoch: [1] [5080/6500] lr: 0.000043 closs: 0.6898 (0.7620) grad_norm: 0.3674 (0.4563) time: 5.5790 data: 0.0001 max mem: 71357 -[11:20:39.901787] Epoch: [1] [5090/6500] lr: 0.000043 closs: 0.6929 (0.7620) grad_norm: 0.3674 (0.4563) time: 5.5832 data: 0.0001 max mem: 71357 -[11:21:35.717532] Epoch: [1] [5100/6500] lr: 0.000043 closs: 0.7438 (0.7620) grad_norm: 0.3975 (0.4565) time: 5.5814 data: 0.0001 max mem: 71357 -[11:22:31.404559] Epoch: [1] [5110/6500] lr: 0.000043 closs: 0.7661 (0.7621) grad_norm: 0.4491 (0.4566) time: 5.5750 data: 0.0001 max mem: 71357 -[11:23:27.158982] Epoch: [1] [5120/6500] lr: 0.000043 closs: 0.8052 (0.7622) grad_norm: 0.4491 (0.4565) time: 5.5720 data: 0.0001 max mem: 71357 -[11:24:22.940301] Epoch: [1] [5130/6500] lr: 0.000043 closs: 0.7489 (0.7619) grad_norm: 0.4491 (0.4564) time: 5.5767 data: 0.0001 max mem: 71357 -[11:25:18.734962] Epoch: [1] [5140/6500] lr: 0.000043 closs: 0.7677 (0.7621) grad_norm: 0.4062 (0.4563) time: 5.5787 data: 0.0002 max mem: 71357 -[11:26:14.603090] Epoch: [1] [5150/6500] lr: 0.000043 closs: 0.7984 (0.7621) grad_norm: 0.3606 (0.4563) time: 5.5830 data: 0.0002 max mem: 71357 -[11:27:10.315042] Epoch: [1] [5160/6500] lr: 0.000043 closs: 0.7870 (0.7623) grad_norm: 0.3626 (0.4561) time: 5.5789 data: 0.0001 max mem: 71357 -[11:28:05.963749] Epoch: [1] [5170/6500] lr: 0.000043 closs: 0.8181 (0.7624) grad_norm: 0.3642 (0.4562) time: 5.5679 data: 0.0001 max mem: 71357 -[11:29:01.772870] Epoch: [1] [5180/6500] lr: 0.000043 closs: 0.7284 (0.7623) grad_norm: 0.3776 (0.4562) time: 5.5728 data: 0.0001 max mem: 71357 -[11:29:57.532695] Epoch: [1] [5190/6500] lr: 0.000043 closs: 0.7601 (0.7623) grad_norm: 0.3662 (0.4561) time: 5.5783 data: 0.0002 max mem: 71357 -[11:30:53.205321] Epoch: [1] [5200/6500] lr: 0.000043 closs: 0.7734 (0.7623) grad_norm: 0.4361 (0.4561) time: 5.5715 data: 0.0002 max mem: 71357 -[11:31:48.890650] Epoch: [1] [5210/6500] lr: 0.000043 closs: 0.7016 (0.7622) grad_norm: 0.4014 (0.4561) time: 5.5678 data: 0.0001 max mem: 71357 -[11:32:44.490303] Epoch: [1] [5220/6500] lr: 0.000043 closs: 0.7215 (0.7623) grad_norm: 0.4014 (0.4563) time: 5.5642 data: 0.0001 max mem: 71357 -[11:33:40.288737] Epoch: [1] [5230/6500] lr: 0.000042 closs: 0.6987 (0.7620) grad_norm: 0.4014 (0.4562) time: 5.5698 data: 0.0001 max mem: 71357 -[11:34:36.201586] Epoch: [1] [5240/6500] lr: 0.000042 closs: 0.7130 (0.7620) grad_norm: 0.3873 (0.4561) time: 5.5855 data: 0.0001 max mem: 71357 -[11:35:31.926282] Epoch: [1] [5250/6500] lr: 0.000042 closs: 0.7390 (0.7619) grad_norm: 0.3751 (0.4559) time: 5.5818 data: 0.0002 max mem: 71357 -[11:36:27.605601] Epoch: [1] [5260/6500] lr: 0.000042 closs: 0.6863 (0.7618) grad_norm: 0.3751 (0.4560) time: 5.5701 data: 0.0002 max mem: 71357 -[11:37:23.356717] Epoch: [1] [5270/6500] lr: 0.000042 closs: 0.7545 (0.7619) grad_norm: 0.3868 (0.4560) time: 5.5714 data: 0.0001 max mem: 71357 -[11:38:19.052455] Epoch: [1] [5280/6500] lr: 0.000042 closs: 0.7771 (0.7618) grad_norm: 0.4176 (0.4562) time: 5.5723 data: 0.0001 max mem: 71357 -[11:39:14.810553] Epoch: [1] [5290/6500] lr: 0.000042 closs: 0.7234 (0.7619) grad_norm: 0.4176 (0.4560) time: 5.5726 data: 0.0001 max mem: 71357 -[11:40:10.478168] Epoch: [1] [5300/6500] lr: 0.000042 closs: 0.7234 (0.7619) grad_norm: 0.3894 (0.4561) time: 5.5712 data: 0.0002 max mem: 71357 -[11:41:06.204522] Epoch: [1] [5310/6500] lr: 0.000042 closs: 0.8350 (0.7620) grad_norm: 0.4485 (0.4564) time: 5.5696 data: 0.0002 max mem: 71357 -[11:42:02.028377] Epoch: [1] [5320/6500] lr: 0.000042 closs: 0.8021 (0.7620) grad_norm: 0.3894 (0.4565) time: 5.5774 data: 0.0001 max mem: 71357 -[11:42:57.762208] Epoch: [1] [5330/6500] lr: 0.000042 closs: 0.7210 (0.7620) grad_norm: 0.4442 (0.4564) time: 5.5778 data: 0.0001 max mem: 71357 -[11:43:53.467868] Epoch: [1] [5340/6500] lr: 0.000042 closs: 0.7326 (0.7621) grad_norm: 0.4205 (0.4564) time: 5.5719 data: 0.0001 max mem: 71357 -[11:44:49.249976] Epoch: [1] [5350/6500] lr: 0.000042 closs: 0.7308 (0.7620) grad_norm: 0.3894 (0.4562) time: 5.5743 data: 0.0001 max mem: 71357 -[11:45:44.957778] Epoch: [1] [5360/6500] lr: 0.000042 closs: 0.6752 (0.7620) grad_norm: 0.3891 (0.4562) time: 5.5744 data: 0.0001 max mem: 71357 -[11:46:40.748059] Epoch: [1] [5370/6500] lr: 0.000042 closs: 0.6930 (0.7618) grad_norm: 0.3779 (0.4562) time: 5.5748 data: 0.0001 max mem: 71357 -[11:47:36.454319] Epoch: [1] [5380/6500] lr: 0.000042 closs: 0.7712 (0.7620) grad_norm: 0.3879 (0.4562) time: 5.5747 data: 0.0001 max mem: 71357 -[11:48:32.242290] Epoch: [1] [5390/6500] lr: 0.000042 closs: 0.8651 (0.7621) grad_norm: 0.3879 (0.4562) time: 5.5746 data: 0.0001 max mem: 71357 -[11:49:27.957643] Epoch: [1] [5400/6500] lr: 0.000042 closs: 0.7200 (0.7620) grad_norm: 0.4342 (0.4562) time: 5.5751 data: 0.0001 max mem: 71357 -[11:50:23.766466] Epoch: [1] [5410/6500] lr: 0.000042 closs: 0.7646 (0.7621) grad_norm: 0.4439 (0.4564) time: 5.5761 data: 0.0001 max mem: 71357 -[11:51:19.479821] Epoch: [1] [5420/6500] lr: 0.000042 closs: 0.7639 (0.7620) grad_norm: 0.4558 (0.4567) time: 5.5760 data: 0.0001 max mem: 71357 -[11:52:15.244104] Epoch: [1] [5430/6500] lr: 0.000042 closs: 0.7082 (0.7620) grad_norm: 0.4558 (0.4565) time: 5.5738 data: 0.0001 max mem: 71357 -[11:53:10.897641] Epoch: [1] [5440/6500] lr: 0.000042 closs: 0.7920 (0.7621) grad_norm: 0.4017 (0.4564) time: 5.5708 data: 0.0001 max mem: 71357 -[11:54:06.694640] Epoch: [1] [5450/6500] lr: 0.000042 closs: 0.7965 (0.7622) grad_norm: 0.4201 (0.4564) time: 5.5724 data: 0.0001 max mem: 71357 -[11:55:02.484616] Epoch: [1] [5460/6500] lr: 0.000042 closs: 0.7598 (0.7622) grad_norm: 0.3805 (0.4563) time: 5.5793 data: 0.0001 max mem: 71357 -[11:55:58.257271] Epoch: [1] [5470/6500] lr: 0.000042 closs: 0.7568 (0.7622) grad_norm: 0.3925 (0.4562) time: 5.5781 data: 0.0001 max mem: 71357 -[11:56:53.980040] Epoch: [1] [5480/6500] lr: 0.000042 closs: 0.7271 (0.7621) grad_norm: 0.3925 (0.4563) time: 5.5747 data: 0.0001 max mem: 71357 -[11:57:49.775101] Epoch: [1] [5490/6500] lr: 0.000042 closs: 0.7271 (0.7620) grad_norm: 0.3860 (0.4561) time: 5.5758 data: 0.0001 max mem: 71357 -[11:58:45.515824] Epoch: [1] [5500/6500] lr: 0.000042 closs: 0.7487 (0.7620) grad_norm: 0.3860 (0.4560) time: 5.5767 data: 0.0001 max mem: 71357 -[11:59:41.216243] Epoch: [1] [5510/6500] lr: 0.000042 closs: 0.7174 (0.7619) grad_norm: 0.4166 (0.4561) time: 5.5720 data: 0.0001 max mem: 71357 -[12:00:36.966815] Epoch: [1] [5520/6500] lr: 0.000042 closs: 0.7540 (0.7621) grad_norm: 0.3887 (0.4560) time: 5.5725 data: 0.0001 max mem: 71357 -[12:01:32.831173] Epoch: [1] [5530/6500] lr: 0.000042 closs: 0.7546 (0.7621) grad_norm: 0.4038 (0.4560) time: 5.5807 data: 0.0001 max mem: 71357 -[12:02:28.647159] Epoch: [1] [5540/6500] lr: 0.000042 closs: 0.7516 (0.7621) grad_norm: 0.4075 (0.4560) time: 5.5839 data: 0.0001 max mem: 71357 -[12:03:24.364310] Epoch: [1] [5550/6500] lr: 0.000042 closs: 0.7670 (0.7621) grad_norm: 0.3887 (0.4558) time: 5.5766 data: 0.0001 max mem: 71357 -[12:04:20.155378] Epoch: [1] [5560/6500] lr: 0.000042 closs: 0.7699 (0.7622) grad_norm: 0.3411 (0.4556) time: 5.5753 data: 0.0001 max mem: 71357 -[12:05:15.851908] Epoch: [1] [5570/6500] lr: 0.000042 closs: 0.7497 (0.7622) grad_norm: 0.3411 (0.4555) time: 5.5743 data: 0.0001 max mem: 71357 -[12:06:11.602612] Epoch: [1] [5580/6500] lr: 0.000042 closs: 0.7377 (0.7622) grad_norm: 0.3426 (0.4556) time: 5.5723 data: 0.0001 max mem: 71357 -[12:07:07.492817] Epoch: [1] [5590/6500] lr: 0.000041 closs: 0.7341 (0.7622) grad_norm: 0.3762 (0.4555) time: 5.5820 data: 0.0001 max mem: 71357 -[12:08:03.293439] Epoch: [1] [5600/6500] lr: 0.000041 closs: 0.7341 (0.7623) grad_norm: 0.4020 (0.4553) time: 5.5845 data: 0.0001 max mem: 71357 -[12:08:58.953646] Epoch: [1] [5610/6500] lr: 0.000041 closs: 0.7692 (0.7621) grad_norm: 0.4286 (0.4556) time: 5.5730 data: 0.0001 max mem: 71357 -[12:09:54.762282] Epoch: [1] [5620/6500] lr: 0.000041 closs: 0.7133 (0.7621) grad_norm: 0.3748 (0.4555) time: 5.5733 data: 0.0001 max mem: 71357 -[12:10:50.637061] Epoch: [1] [5630/6500] lr: 0.000041 closs: 0.7529 (0.7622) grad_norm: 0.3843 (0.4555) time: 5.5841 data: 0.0001 max mem: 71357 -[12:11:46.338076] Epoch: [1] [5640/6500] lr: 0.000041 closs: 0.8054 (0.7622) grad_norm: 0.3843 (0.4553) time: 5.5787 data: 0.0001 max mem: 71357 -[12:12:42.052979] Epoch: [1] [5650/6500] lr: 0.000041 closs: 0.8054 (0.7623) grad_norm: 0.3795 (0.4552) time: 5.5707 data: 0.0001 max mem: 71357 -[12:13:37.757138] Epoch: [1] [5660/6500] lr: 0.000041 closs: 0.7757 (0.7623) grad_norm: 0.3795 (0.4552) time: 5.5709 data: 0.0001 max mem: 71357 -[12:14:33.519114] Epoch: [1] [5670/6500] lr: 0.000041 closs: 0.7771 (0.7623) grad_norm: 0.3751 (0.4552) time: 5.5732 data: 0.0001 max mem: 71357 -[12:15:29.262342] Epoch: [1] [5680/6500] lr: 0.000041 closs: 0.8003 (0.7625) grad_norm: 0.3742 (0.4550) time: 5.5752 data: 0.0001 max mem: 71357 -[12:16:24.960305] Epoch: [1] [5690/6500] lr: 0.000041 closs: 0.7774 (0.7625) grad_norm: 0.3751 (0.4549) time: 5.5720 data: 0.0001 max mem: 71357 -[12:17:20.818451] Epoch: [1] [5700/6500] lr: 0.000041 closs: 0.7509 (0.7625) grad_norm: 0.3829 (0.4548) time: 5.5777 data: 0.0001 max mem: 71357 -[12:18:16.589827] Epoch: [1] [5710/6500] lr: 0.000041 closs: 0.7242 (0.7625) grad_norm: 0.3742 (0.4547) time: 5.5814 data: 0.0001 max mem: 71357 -[12:19:12.364366] Epoch: [1] [5720/6500] lr: 0.000041 closs: 0.7242 (0.7624) grad_norm: 0.4329 (0.4547) time: 5.5772 data: 0.0002 max mem: 71357 -[12:20:08.204706] Epoch: [1] [5730/6500] lr: 0.000041 closs: 0.7491 (0.7624) grad_norm: 0.3653 (0.4545) time: 5.5806 data: 0.0002 max mem: 71357 -[12:21:03.861751] Epoch: [1] [5740/6500] lr: 0.000041 closs: 0.7838 (0.7624) grad_norm: 0.3653 (0.4545) time: 5.5747 data: 0.0001 max mem: 71357 -[12:21:59.718259] Epoch: [1] [5750/6500] lr: 0.000041 closs: 0.7416 (0.7624) grad_norm: 0.3975 (0.4546) time: 5.5756 data: 0.0001 max mem: 71357 -[12:22:55.524356] Epoch: [1] [5760/6500] lr: 0.000041 closs: 0.7056 (0.7623) grad_norm: 0.3606 (0.4545) time: 5.5830 data: 0.0001 max mem: 71357 -[12:23:51.380407] Epoch: [1] [5770/6500] lr: 0.000041 closs: 0.7723 (0.7624) grad_norm: 0.3858 (0.4544) time: 5.5830 data: 0.0001 max mem: 71357 -[12:24:47.150261] Epoch: [1] [5780/6500] lr: 0.000041 closs: 0.7954 (0.7623) grad_norm: 0.4207 (0.4544) time: 5.5812 data: 0.0001 max mem: 71357 -[12:25:42.827316] Epoch: [1] [5790/6500] lr: 0.000041 closs: 0.6956 (0.7622) grad_norm: 0.3718 (0.4544) time: 5.5722 data: 0.0002 max mem: 71357 -[12:26:38.608622] Epoch: [1] [5800/6500] lr: 0.000041 closs: 0.6790 (0.7621) grad_norm: 0.3812 (0.4543) time: 5.5728 data: 0.0001 max mem: 71357 -[12:27:34.355703] Epoch: [1] [5810/6500] lr: 0.000041 closs: 0.6473 (0.7620) grad_norm: 0.4115 (0.4544) time: 5.5763 data: 0.0001 max mem: 71357 -[12:28:30.033082] Epoch: [1] [5820/6500] lr: 0.000041 closs: 0.6839 (0.7620) grad_norm: 0.4103 (0.4543) time: 5.5711 data: 0.0001 max mem: 71357 -[12:29:25.690915] Epoch: [1] [5830/6500] lr: 0.000041 closs: 0.7630 (0.7620) grad_norm: 0.4103 (0.4544) time: 5.5667 data: 0.0001 max mem: 71357 -[12:30:21.426603] Epoch: [1] [5840/6500] lr: 0.000041 closs: 0.7711 (0.7621) grad_norm: 0.4103 (0.4542) time: 5.5696 data: 0.0001 max mem: 71357 -[12:31:17.292434] Epoch: [1] [5850/6500] lr: 0.000041 closs: 0.8304 (0.7622) grad_norm: 0.3960 (0.4541) time: 5.5800 data: 0.0001 max mem: 71357 -[12:32:12.900412] Epoch: [1] [5860/6500] lr: 0.000041 closs: 0.7820 (0.7621) grad_norm: 0.3960 (0.4541) time: 5.5736 data: 0.0001 max mem: 71357 -[12:33:08.766672] Epoch: [1] [5870/6500] lr: 0.000041 closs: 0.6789 (0.7621) grad_norm: 0.3778 (0.4540) time: 5.5736 data: 0.0001 max mem: 71357 -[12:34:04.393916] Epoch: [1] [5880/6500] lr: 0.000041 closs: 0.7246 (0.7622) grad_norm: 0.3856 (0.4547) time: 5.5746 data: 0.0001 max mem: 71357 -[12:35:00.125761] Epoch: [1] [5890/6500] lr: 0.000041 closs: 0.7455 (0.7621) grad_norm: 0.3856 (0.4546) time: 5.5679 data: 0.0001 max mem: 71357 -[12:35:55.981470] Epoch: [1] [5900/6500] lr: 0.000041 closs: 0.7150 (0.7620) grad_norm: 0.3649 (0.4545) time: 5.5793 data: 0.0001 max mem: 71357 -[12:36:51.710443] Epoch: [1] [5910/6500] lr: 0.000041 closs: 0.7150 (0.7620) grad_norm: 0.3649 (0.4544) time: 5.5792 data: 0.0001 max mem: 71357 -[12:37:47.571872] Epoch: [1] [5920/6500] lr: 0.000041 closs: 0.6987 (0.7619) grad_norm: 0.3471 (0.4542) time: 5.5794 data: 0.0001 max mem: 71357 -[12:38:43.443859] Epoch: [1] [5930/6500] lr: 0.000040 closs: 0.6987 (0.7619) grad_norm: 0.3331 (0.4540) time: 5.5866 data: 0.0001 max mem: 71357 -[12:39:39.343282] Epoch: [1] [5940/6500] lr: 0.000040 closs: 0.7192 (0.7619) grad_norm: 0.3435 (0.4541) time: 5.5884 data: 0.0001 max mem: 71357 -[12:40:35.146723] Epoch: [1] [5950/6500] lr: 0.000040 closs: 0.7379 (0.7619) grad_norm: 0.3435 (0.4539) time: 5.5850 data: 0.0002 max mem: 71357 -[12:41:30.902370] Epoch: [1] [5960/6500] lr: 0.000040 closs: 0.7951 (0.7619) grad_norm: 0.3445 (0.4538) time: 5.5779 data: 0.0002 max mem: 71357 -[12:42:26.717992] Epoch: [1] [5970/6500] lr: 0.000040 closs: 0.7941 (0.7620) grad_norm: 0.4037 (0.4539) time: 5.5785 data: 0.0002 max mem: 71357 -[12:43:22.562358] Epoch: [1] [5980/6500] lr: 0.000040 closs: 0.7629 (0.7621) grad_norm: 0.3538 (0.4538) time: 5.5829 data: 0.0002 max mem: 71357 -[12:44:18.336240] Epoch: [1] [5990/6500] lr: 0.000040 closs: 0.7501 (0.7620) grad_norm: 0.3538 (0.4536) time: 5.5808 data: 0.0001 max mem: 71357 -[12:45:14.046587] Epoch: [1] [6000/6500] lr: 0.000040 closs: 0.6951 (0.7620) grad_norm: 0.3621 (0.4535) time: 5.5741 data: 0.0002 max mem: 71357 -[12:46:09.836934] Epoch: [1] [6010/6500] lr: 0.000040 closs: 0.6929 (0.7619) grad_norm: 0.3621 (0.4534) time: 5.5749 data: 0.0002 max mem: 71357 -[12:47:05.668098] Epoch: [1] [6020/6500] lr: 0.000040 closs: 0.7398 (0.7619) grad_norm: 0.3923 (0.4538) time: 5.5810 data: 0.0002 max mem: 71357 -[12:48:01.635878] Epoch: [1] [6030/6500] lr: 0.000040 closs: 0.7508 (0.7619) grad_norm: 0.4124 (0.4537) time: 5.5898 data: 0.0002 max mem: 71357 -[12:48:57.359248] Epoch: [1] [6040/6500] lr: 0.000040 closs: 0.7133 (0.7618) grad_norm: 0.4416 (0.4538) time: 5.5845 data: 0.0002 max mem: 71357 -[12:49:53.165180] Epoch: [1] [6050/6500] lr: 0.000040 closs: 0.7493 (0.7620) grad_norm: 0.4416 (0.4538) time: 5.5764 data: 0.0001 max mem: 71357 -[12:50:49.040989] Epoch: [1] [6060/6500] lr: 0.000040 closs: 0.8315 (0.7620) grad_norm: 0.4096 (0.4538) time: 5.5840 data: 0.0002 max mem: 71357 -[12:51:44.970720] Epoch: [1] [6070/6500] lr: 0.000040 closs: 0.7805 (0.7621) grad_norm: 0.4065 (0.4536) time: 5.5902 data: 0.0002 max mem: 71357 -[12:52:40.669392] Epoch: [1] [6080/6500] lr: 0.000040 closs: 0.8480 (0.7623) grad_norm: 0.4065 (0.4539) time: 5.5813 data: 0.0001 max mem: 71357 -[12:53:36.427897] Epoch: [1] [6090/6500] lr: 0.000040 closs: 0.8226 (0.7624) grad_norm: 0.4065 (0.4540) time: 5.5728 data: 0.0001 max mem: 71357 -[12:54:32.195913] Epoch: [1] [6100/6500] lr: 0.000040 closs: 0.7264 (0.7623) grad_norm: 0.3978 (0.4539) time: 5.5763 data: 0.0001 max mem: 71357 -[12:55:27.987231] Epoch: [1] [6110/6500] lr: 0.000040 closs: 0.8074 (0.7625) grad_norm: 0.4201 (0.4538) time: 5.5779 data: 0.0001 max mem: 71357 -[12:56:23.829834] Epoch: [1] [6120/6500] lr: 0.000040 closs: 0.8245 (0.7626) grad_norm: 0.3929 (0.4537) time: 5.5816 data: 0.0001 max mem: 71357 -[12:57:19.521242] Epoch: [1] [6130/6500] lr: 0.000040 closs: 0.8077 (0.7628) grad_norm: 0.3743 (0.4536) time: 5.5766 data: 0.0001 max mem: 71357 -[12:58:15.203234] Epoch: [1] [6140/6500] lr: 0.000040 closs: 0.7957 (0.7627) grad_norm: 0.3747 (0.4556) time: 5.5686 data: 0.0001 max mem: 71357 -[12:59:10.946149] Epoch: [1] [6150/6500] lr: 0.000040 closs: 0.7957 (0.7628) grad_norm: 0.3743 (0.4555) time: 5.5712 data: 0.0001 max mem: 71357 -[13:00:06.841654] Epoch: [1] [6160/6500] lr: 0.000040 closs: 0.8200 (0.7629) grad_norm: 0.3747 (0.4554) time: 5.5818 data: 0.0001 max mem: 71357 -[13:01:02.559307] Epoch: [1] [6170/6500] lr: 0.000040 closs: 0.7666 (0.7628) grad_norm: 0.3837 (0.4553) time: 5.5806 data: 0.0001 max mem: 71357 -[13:01:58.277625] Epoch: [1] [6180/6500] lr: 0.000040 closs: 0.7439 (0.7629) grad_norm: 0.3926 (0.4553) time: 5.5717 data: 0.0001 max mem: 71357 -[13:02:54.029871] Epoch: [1] [6190/6500] lr: 0.000040 closs: 0.8000 (0.7629) grad_norm: 0.3897 (0.4552) time: 5.5734 data: 0.0001 max mem: 71357 -[13:03:49.873703] Epoch: [1] [6200/6500] lr: 0.000040 closs: 0.7667 (0.7628) grad_norm: 0.3926 (0.4552) time: 5.5797 data: 0.0001 max mem: 71357 -[13:04:45.563891] Epoch: [1] [6210/6500] lr: 0.000040 closs: 0.7751 (0.7629) grad_norm: 0.4210 (0.4552) time: 5.5766 data: 0.0001 max mem: 71357 -[13:05:41.287468] Epoch: [1] [6220/6500] lr: 0.000040 closs: 0.7010 (0.7627) grad_norm: 0.3957 (0.4551) time: 5.5706 data: 0.0002 max mem: 71357 -[13:06:37.089385] Epoch: [1] [6230/6500] lr: 0.000040 closs: 0.7090 (0.7628) grad_norm: 0.4210 (0.4550) time: 5.5762 data: 0.0002 max mem: 71357 -[13:07:32.868772] Epoch: [1] [6240/6500] lr: 0.000040 closs: 0.7651 (0.7628) grad_norm: 0.3920 (0.4550) time: 5.5790 data: 0.0001 max mem: 71357 -[13:08:28.652568] Epoch: [1] [6250/6500] lr: 0.000040 closs: 0.7620 (0.7629) grad_norm: 0.3703 (0.4549) time: 5.5781 data: 0.0001 max mem: 71357 -[13:09:24.362382] Epoch: [1] [6260/6500] lr: 0.000039 closs: 0.7858 (0.7629) grad_norm: 0.3703 (0.4548) time: 5.5746 data: 0.0001 max mem: 71357 -[13:10:20.032595] Epoch: [1] [6270/6500] lr: 0.000039 closs: 0.7305 (0.7628) grad_norm: 0.3763 (0.4549) time: 5.5689 data: 0.0001 max mem: 71357 -[13:11:15.731095] Epoch: [1] [6280/6500] lr: 0.000039 closs: 0.7532 (0.7629) grad_norm: 0.3879 (0.4550) time: 5.5683 data: 0.0001 max mem: 71357 -[13:12:11.442048] Epoch: [1] [6290/6500] lr: 0.000039 closs: 0.7507 (0.7628) grad_norm: 0.4623 (0.4552) time: 5.5703 data: 0.0001 max mem: 71357 -[13:13:07.132957] Epoch: [1] [6300/6500] lr: 0.000039 closs: 0.7333 (0.7628) grad_norm: 0.3946 (0.4550) time: 5.5700 data: 0.0001 max mem: 71357 -[13:14:02.908037] Epoch: [1] [6310/6500] lr: 0.000039 closs: 0.7169 (0.7626) grad_norm: 0.4401 (0.4552) time: 5.5732 data: 0.0001 max mem: 71357 -[13:14:58.686738] Epoch: [1] [6320/6500] lr: 0.000039 closs: 0.7381 (0.7627) grad_norm: 0.3946 (0.4552) time: 5.5776 data: 0.0001 max mem: 71357 -[13:15:54.502734] Epoch: [1] [6330/6500] lr: 0.000039 closs: 0.7568 (0.7627) grad_norm: 0.3855 (0.4550) time: 5.5796 data: 0.0001 max mem: 71357 -[13:16:50.331408] Epoch: [1] [6340/6500] lr: 0.000039 closs: 0.7370 (0.7626) grad_norm: 0.3901 (0.4549) time: 5.5821 data: 0.0001 max mem: 71357 -[13:17:46.076366] Epoch: [1] [6350/6500] lr: 0.000039 closs: 0.7401 (0.7627) grad_norm: 0.3855 (0.4551) time: 5.5786 data: 0.0001 max mem: 71357 -[13:18:41.870339] Epoch: [1] [6360/6500] lr: 0.000039 closs: 0.8105 (0.7628) grad_norm: 0.3901 (0.4551) time: 5.5768 data: 0.0002 max mem: 71357 -[13:19:37.808359] Epoch: [1] [6370/6500] lr: 0.000039 closs: 0.7998 (0.7628) grad_norm: 0.4095 (0.4550) time: 5.5865 data: 0.0002 max mem: 71357 -[13:20:33.725798] Epoch: [1] [6380/6500] lr: 0.000039 closs: 0.7342 (0.7626) grad_norm: 0.3924 (0.4549) time: 5.5926 data: 0.0002 max mem: 71357 -[13:21:29.484753] Epoch: [1] [6390/6500] lr: 0.000039 closs: 0.6765 (0.7626) grad_norm: 0.3924 (0.4548) time: 5.5836 data: 0.0002 max mem: 71357 -[13:22:25.257086] Epoch: [1] [6400/6500] lr: 0.000039 closs: 0.7452 (0.7627) grad_norm: 0.3553 (0.4548) time: 5.5764 data: 0.0002 max mem: 71357 -[13:23:21.034494] Epoch: [1] [6410/6500] lr: 0.000039 closs: 0.7509 (0.7627) grad_norm: 0.3656 (0.4547) time: 5.5774 data: 0.0001 max mem: 71357 -[13:24:17.080944] Epoch: [1] [6420/6500] lr: 0.000039 closs: 0.7457 (0.7627) grad_norm: 0.3553 (0.4545) time: 5.5911 data: 0.0001 max mem: 71357 -[13:25:12.899426] Epoch: [1] [6430/6500] lr: 0.000039 closs: 0.7345 (0.7628) grad_norm: 0.3532 (0.4545) time: 5.5931 data: 0.0001 max mem: 71357 -[13:26:08.709394] Epoch: [1] [6440/6500] lr: 0.000039 closs: 0.7518 (0.7628) grad_norm: 0.3989 (0.4544) time: 5.5813 data: 0.0001 max mem: 71357 -[13:27:04.428882] Epoch: [1] [6450/6500] lr: 0.000039 closs: 0.7228 (0.7627) grad_norm: 0.3685 (0.4544) time: 5.5763 data: 0.0001 max mem: 71357 -[13:28:00.252225] Epoch: [1] [6460/6500] lr: 0.000039 closs: 0.7051 (0.7626) grad_norm: 0.3989 (0.4543) time: 5.5770 data: 0.0001 max mem: 71357 -[13:28:56.043953] Epoch: [1] [6470/6500] lr: 0.000039 closs: 0.7380 (0.7626) grad_norm: 0.4243 (0.4545) time: 5.5807 data: 0.0001 max mem: 71357 -[13:29:51.797569] Epoch: [1] [6480/6500] lr: 0.000039 closs: 0.7187 (0.7625) grad_norm: 0.3741 (0.4543) time: 5.5772 data: 0.0001 max mem: 71357 -[13:30:47.516773] Epoch: [1] [6490/6500] lr: 0.000039 closs: 0.6832 (0.7624) grad_norm: 0.4119 (0.4546) time: 5.5735 data: 0.0001 max mem: 71357 -[13:31:38.239874] Epoch: [1] Total time: 10:04:02 -[13:31:38.277561] Averaged stats: lr: 0.000039 closs: 0.6790 (0.7624) grad_norm: 0.3968 (0.4545) -[13:31:38.437483] model saved -[13:31:39.344162] optimizer saved -[13:31:39.344627] other rank-common saved -[13:31:39.347843] rank-specific saved -[13:31:39.356891] log_dir: ./output_dir -[13:31:47.519594] Epoch: [2] [0/6500] lr: 0.000039 closs: 0.8530 (0.8530) time: 8.1620 data: 2.5124 max mem: 71357 -[13:32:43.288252] Epoch: [2] [10/6500] lr: 0.000039 closs: 0.7463 (0.7264) grad_norm: 0.3308 (0.3256) time: 5.8118 data: 0.2286 max mem: 71357 -[13:33:39.077409] Epoch: [2] [20/6500] lr: 0.000039 closs: 0.7463 (0.7621) grad_norm: 0.3325 (0.3434) time: 5.5778 data: 0.0001 max mem: 71357 -[13:34:34.867998] Epoch: [2] [30/6500] lr: 0.000039 closs: 0.7167 (0.7603) grad_norm: 0.3547 (0.4323) time: 5.5789 data: 0.0001 max mem: 71357 -[13:35:30.670234] Epoch: [2] [40/6500] lr: 0.000039 closs: 0.7167 (0.7590) grad_norm: 0.3576 (0.4284) time: 5.5796 data: 0.0001 max mem: 71357 -[13:36:26.526233] Epoch: [2] [50/6500] lr: 0.000039 closs: 0.7527 (0.7555) grad_norm: 0.3960 (0.4176) time: 5.5828 data: 0.0001 max mem: 71357 -[13:37:22.300149] Epoch: [2] [60/6500] lr: 0.000039 closs: 0.7448 (0.7551) grad_norm: 0.4199 (0.4239) time: 5.5814 data: 0.0001 max mem: 71357 -[13:38:18.028707] Epoch: [2] [70/6500] lr: 0.000039 closs: 0.7028 (0.7479) grad_norm: 0.3856 (0.4157) time: 5.5750 data: 0.0001 max mem: 71357 -[13:39:13.821819] Epoch: [2] [80/6500] lr: 0.000038 closs: 0.7377 (0.7468) grad_norm: 0.3691 (0.4100) time: 5.5760 data: 0.0001 max mem: 71357 -[13:40:09.683506] Epoch: [2] [90/6500] lr: 0.000038 closs: 0.7571 (0.7481) grad_norm: 0.3691 (0.4179) time: 5.5826 data: 0.0001 max mem: 71357 -[13:41:05.487770] Epoch: [2] [100/6500] lr: 0.000038 closs: 0.7571 (0.7515) grad_norm: 0.3682 (0.4140) time: 5.5832 data: 0.0001 max mem: 71357 -[13:42:01.187296] Epoch: [2] [110/6500] lr: 0.000038 closs: 0.7371 (0.7502) grad_norm: 0.3937 (0.4187) time: 5.5751 data: 0.0001 max mem: 71357 -[13:42:56.922471] Epoch: [2] [120/6500] lr: 0.000038 closs: 0.7371 (0.7516) grad_norm: 0.3937 (0.4227) time: 5.5716 data: 0.0002 max mem: 71357 -[13:43:52.820057] Epoch: [2] [130/6500] lr: 0.000038 closs: 0.7830 (0.7562) grad_norm: 0.4010 (0.4249) time: 5.5815 data: 0.0002 max mem: 71357 -[13:44:48.624177] Epoch: [2] [140/6500] lr: 0.000038 closs: 0.7541 (0.7543) grad_norm: 0.4010 (0.4211) time: 5.5850 data: 0.0001 max mem: 71357 -[13:45:44.395166] Epoch: [2] [150/6500] lr: 0.000038 closs: 0.6841 (0.7539) grad_norm: 0.3672 (0.4165) time: 5.5787 data: 0.0001 max mem: 71357 -[13:46:40.119569] Epoch: [2] [160/6500] lr: 0.000038 closs: 0.7270 (0.7544) grad_norm: 0.4010 (0.4255) time: 5.5747 data: 0.0001 max mem: 71357 -[13:47:35.890460] Epoch: [2] [170/6500] lr: 0.000038 closs: 0.7027 (0.7530) grad_norm: 0.3672 (0.4220) time: 5.5747 data: 0.0001 max mem: 71357 -[13:48:31.736860] Epoch: [2] [180/6500] lr: 0.000038 closs: 0.7138 (0.7518) grad_norm: 0.3619 (0.4208) time: 5.5808 data: 0.0001 max mem: 71357 -[13:49:27.392286] Epoch: [2] [190/6500] lr: 0.000038 closs: 0.7209 (0.7522) grad_norm: 0.3753 (0.4228) time: 5.5750 data: 0.0001 max mem: 71357 -[13:50:23.153601] Epoch: [2] [200/6500] lr: 0.000038 closs: 0.7229 (0.7506) grad_norm: 0.3753 (0.4260) time: 5.5707 data: 0.0001 max mem: 71357 -[13:51:18.968289] Epoch: [2] [210/6500] lr: 0.000038 closs: 0.7229 (0.7512) grad_norm: 0.4066 (0.4268) time: 5.5787 data: 0.0001 max mem: 71357 -[13:52:14.932241] Epoch: [2] [220/6500] lr: 0.000038 closs: 0.6851 (0.7505) grad_norm: 0.4066 (0.4231) time: 5.5888 data: 0.0001 max mem: 71357 -[13:53:10.610429] Epoch: [2] [230/6500] lr: 0.000038 closs: 0.7820 (0.7548) grad_norm: 0.3845 (0.4203) time: 5.5820 data: 0.0001 max mem: 71357 -[13:54:06.355460] Epoch: [2] [240/6500] lr: 0.000038 closs: 0.7820 (0.7562) grad_norm: 0.3753 (0.4202) time: 5.5711 data: 0.0001 max mem: 71357 -[13:55:02.130045] Epoch: [2] [250/6500] lr: 0.000038 closs: 0.7681 (0.7567) grad_norm: 0.3540 (0.4181) time: 5.5759 data: 0.0001 max mem: 71357 -[13:55:57.941222] Epoch: [2] [260/6500] lr: 0.000038 closs: 0.7585 (0.7572) grad_norm: 0.3630 (0.4196) time: 5.5792 data: 0.0001 max mem: 71357 -[13:56:53.824737] Epoch: [2] [270/6500] lr: 0.000038 closs: 0.7585 (0.7570) grad_norm: 0.3699 (0.4187) time: 5.5847 data: 0.0001 max mem: 71357 -[13:57:49.697171] Epoch: [2] [280/6500] lr: 0.000038 closs: 0.7405 (0.7566) grad_norm: 0.3712 (0.4186) time: 5.5877 data: 0.0001 max mem: 71357 -[13:58:45.437495] Epoch: [2] [290/6500] lr: 0.000038 closs: 0.7335 (0.7546) grad_norm: 0.3789 (0.4163) time: 5.5806 data: 0.0001 max mem: 71357 -[13:59:41.195413] Epoch: [2] [300/6500] lr: 0.000038 closs: 0.6860 (0.7536) grad_norm: 0.3789 (0.4153) time: 5.5748 data: 0.0001 max mem: 71357 -[14:00:37.032270] Epoch: [2] [310/6500] lr: 0.000038 closs: 0.7333 (0.7531) grad_norm: 0.3733 (0.4166) time: 5.5796 data: 0.0001 max mem: 71357 -[14:01:32.744167] Epoch: [2] [320/6500] lr: 0.000038 closs: 0.7333 (0.7524) grad_norm: 0.3825 (0.4177) time: 5.5773 data: 0.0001 max mem: 71357 -[14:02:28.443547] Epoch: [2] [330/6500] lr: 0.000038 closs: 0.7277 (0.7520) grad_norm: 0.3988 (0.4181) time: 5.5705 data: 0.0001 max mem: 71357 -[14:03:24.231708] Epoch: [2] [340/6500] lr: 0.000038 closs: 0.7531 (0.7512) grad_norm: 0.3988 (0.4170) time: 5.5743 data: 0.0001 max mem: 71357 -[14:04:20.006444] Epoch: [2] [350/6500] lr: 0.000038 closs: 0.7179 (0.7498) grad_norm: 0.4000 (0.4180) time: 5.5780 data: 0.0001 max mem: 71357 -[14:05:15.765801] Epoch: [2] [360/6500] lr: 0.000038 closs: 0.6998 (0.7484) grad_norm: 0.3938 (0.4171) time: 5.5766 data: 0.0001 max mem: 71357 -[14:06:11.530058] Epoch: [2] [370/6500] lr: 0.000038 closs: 0.6998 (0.7490) grad_norm: 0.3695 (0.4163) time: 5.5761 data: 0.0001 max mem: 71357 -[14:07:07.345190] Epoch: [2] [380/6500] lr: 0.000038 closs: 0.6916 (0.7479) grad_norm: 0.3695 (0.4148) time: 5.5789 data: 0.0001 max mem: 71357 -[14:08:03.215899] Epoch: [2] [390/6500] lr: 0.000038 closs: 0.7538 (0.7490) grad_norm: 0.3527 (0.4131) time: 5.5842 data: 0.0001 max mem: 71357 -[14:08:59.012783] Epoch: [2] [400/6500] lr: 0.000037 closs: 0.7534 (0.7484) grad_norm: 0.3527 (0.4176) time: 5.5833 data: 0.0001 max mem: 71357 -[14:09:54.780467] Epoch: [2] [410/6500] lr: 0.000037 closs: 0.6848 (0.7468) grad_norm: 0.3885 (0.4227) time: 5.5781 data: 0.0001 max mem: 71357 -[14:10:50.583135] Epoch: [2] [420/6500] lr: 0.000037 closs: 0.7102 (0.7481) grad_norm: 0.4692 (0.4295) time: 5.5784 data: 0.0001 max mem: 71357 -[14:11:46.513039] Epoch: [2] [430/6500] lr: 0.000037 closs: 0.7994 (0.7501) grad_norm: 0.4802 (0.4279) time: 5.5865 data: 0.0002 max mem: 71357 -[14:12:42.399415] Epoch: [2] [440/6500] lr: 0.000037 closs: 0.7675 (0.7508) grad_norm: 0.3810 (0.4284) time: 5.5907 data: 0.0002 max mem: 71357 -[14:13:38.199675] Epoch: [2] [450/6500] lr: 0.000037 closs: 0.7309 (0.7514) grad_norm: 0.3810 (0.4298) time: 5.5842 data: 0.0002 max mem: 71357 -[14:14:34.022210] Epoch: [2] [460/6500] lr: 0.000037 closs: 0.7586 (0.7526) grad_norm: 0.3805 (0.4293) time: 5.5810 data: 0.0001 max mem: 71357 -[14:15:29.816050] Epoch: [2] [470/6500] lr: 0.000037 closs: 0.7586 (0.7531) grad_norm: 0.3805 (0.4281) time: 5.5807 data: 0.0001 max mem: 71357 -[14:16:25.599470] Epoch: [2] [480/6500] lr: 0.000037 closs: 0.7866 (0.7538) grad_norm: 0.3662 (0.4269) time: 5.5788 data: 0.0001 max mem: 71357 -[14:17:21.518261] Epoch: [2] [490/6500] lr: 0.000037 closs: 0.8141 (0.7544) grad_norm: 0.3628 (0.4270) time: 5.5850 data: 0.0001 max mem: 71357 -[14:18:17.320636] Epoch: [2] [500/6500] lr: 0.000037 closs: 0.8202 (0.7549) grad_norm: 0.3662 (0.4269) time: 5.5860 data: 0.0001 max mem: 71357 -[14:19:12.994870] Epoch: [2] [510/6500] lr: 0.000037 closs: 0.7792 (0.7557) grad_norm: 0.4105 (0.4281) time: 5.5737 data: 0.0001 max mem: 71357 -[14:20:08.814286] Epoch: [2] [520/6500] lr: 0.000037 closs: 0.7314 (0.7551) grad_norm: 0.4120 (0.4289) time: 5.5746 data: 0.0001 max mem: 71357 -[14:21:04.579633] Epoch: [2] [530/6500] lr: 0.000037 closs: 0.7105 (0.7545) grad_norm: 0.4370 (0.4294) time: 5.5791 data: 0.0001 max mem: 71357 -[14:22:00.313354] Epoch: [2] [540/6500] lr: 0.000037 closs: 0.6849 (0.7527) grad_norm: 0.4191 (0.4286) time: 5.5749 data: 0.0001 max mem: 71357 -[14:22:56.019516] Epoch: [2] [550/6500] lr: 0.000037 closs: 0.7390 (0.7546) grad_norm: 0.4063 (0.4291) time: 5.5719 data: 0.0001 max mem: 71357 -[14:23:51.755868] Epoch: [2] [560/6500] lr: 0.000037 closs: 0.8036 (0.7556) grad_norm: 0.4064 (0.4301) time: 5.5721 data: 0.0001 max mem: 71357 -[14:24:47.612236] Epoch: [2] [570/6500] lr: 0.000037 closs: 0.7781 (0.7556) grad_norm: 0.4063 (0.4297) time: 5.5795 data: 0.0001 max mem: 71357 -[14:25:43.321070] Epoch: [2] [580/6500] lr: 0.000037 closs: 0.7385 (0.7558) grad_norm: 0.4227 (0.4305) time: 5.5781 data: 0.0001 max mem: 71357 -[14:26:39.029217] Epoch: [2] [590/6500] lr: 0.000037 closs: 0.8222 (0.7575) grad_norm: 0.4322 (0.4309) time: 5.5707 data: 0.0001 max mem: 71357 -[14:27:34.840416] Epoch: [2] [600/6500] lr: 0.000037 closs: 0.8283 (0.7582) grad_norm: 0.4322 (0.4303) time: 5.5759 data: 0.0001 max mem: 71357 -[14:28:30.571734] Epoch: [2] [610/6500] lr: 0.000037 closs: 0.6999 (0.7561) grad_norm: 0.4594 (0.4356) time: 5.5771 data: 0.0001 max mem: 71357 -[14:29:26.385644] Epoch: [2] [620/6500] lr: 0.000037 closs: 0.6217 (0.7547) grad_norm: 0.4041 (0.4344) time: 5.5772 data: 0.0001 max mem: 71357 -[14:30:22.152023] Epoch: [2] [630/6500] lr: 0.000037 closs: 0.6924 (0.7546) grad_norm: 0.3903 (0.4342) time: 5.5789 data: 0.0001 max mem: 71357 -[14:31:17.977329] Epoch: [2] [640/6500] lr: 0.000037 closs: 0.7096 (0.7539) grad_norm: 0.3903 (0.4346) time: 5.5795 data: 0.0001 max mem: 71357 -[14:32:13.809917] Epoch: [2] [650/6500] lr: 0.000037 closs: 0.7096 (0.7538) grad_norm: 0.3726 (0.4336) time: 5.5828 data: 0.0001 max mem: 71357 -[14:33:09.675270] Epoch: [2] [660/6500] lr: 0.000037 closs: 0.7337 (0.7539) grad_norm: 0.3889 (0.4341) time: 5.5848 data: 0.0001 max mem: 71357 -[14:34:05.473554] Epoch: [2] [670/6500] lr: 0.000037 closs: 0.6978 (0.7532) grad_norm: 0.3686 (0.4333) time: 5.5831 data: 0.0001 max mem: 71357 -[14:35:01.331221] Epoch: [2] [680/6500] lr: 0.000037 closs: 0.7212 (0.7526) grad_norm: 0.3643 (0.4318) time: 5.5827 data: 0.0001 max mem: 71357 -[14:35:57.063267] Epoch: [2] [690/6500] lr: 0.000037 closs: 0.7305 (0.7527) grad_norm: 0.3668 (0.4318) time: 5.5794 data: 0.0001 max mem: 71357 -[14:36:52.859279] Epoch: [2] [700/6500] lr: 0.000036 closs: 0.7603 (0.7529) grad_norm: 0.3595 (0.4308) time: 5.5763 data: 0.0001 max mem: 71357 -[14:37:48.751222] Epoch: [2] [710/6500] lr: 0.000036 closs: 0.7380 (0.7525) grad_norm: 0.3590 (0.4302) time: 5.5843 data: 0.0002 max mem: 71357 -[14:38:44.494905] Epoch: [2] [720/6500] lr: 0.000036 closs: 0.7349 (0.7529) grad_norm: 0.3672 (0.4298) time: 5.5817 data: 0.0002 max mem: 71357 -[14:39:40.195246] Epoch: [2] [730/6500] lr: 0.000036 closs: 0.7399 (0.7520) grad_norm: 0.4055 (0.4299) time: 5.5721 data: 0.0001 max mem: 71357 -[14:40:36.044045] Epoch: [2] [740/6500] lr: 0.000036 closs: 0.7879 (0.7533) grad_norm: 0.3741 (0.4294) time: 5.5774 data: 0.0001 max mem: 71357 -[14:41:31.881779] Epoch: [2] [750/6500] lr: 0.000036 closs: 0.8348 (0.7536) grad_norm: 0.4074 (0.4296) time: 5.5843 data: 0.0001 max mem: 71357 -[14:42:27.626745] Epoch: [2] [760/6500] lr: 0.000036 closs: 0.7557 (0.7528) grad_norm: 0.4074 (0.4299) time: 5.5790 data: 0.0001 max mem: 71357 -[14:43:23.427493] Epoch: [2] [770/6500] lr: 0.000036 closs: 0.7330 (0.7528) grad_norm: 0.4074 (0.4302) time: 5.5772 data: 0.0001 max mem: 71357 -[14:44:19.252177] Epoch: [2] [780/6500] lr: 0.000036 closs: 0.7833 (0.7537) grad_norm: 0.4074 (0.4319) time: 5.5812 data: 0.0001 max mem: 71357 -[14:45:15.139740] Epoch: [2] [790/6500] lr: 0.000036 closs: 0.7768 (0.7542) grad_norm: 0.3707 (0.4311) time: 5.5855 data: 0.0001 max mem: 71357 -[14:46:10.918823] Epoch: [2] [800/6500] lr: 0.000036 closs: 0.7081 (0.7539) grad_norm: 0.3902 (0.4308) time: 5.5833 data: 0.0001 max mem: 71357 -[14:47:06.736186] Epoch: [2] [810/6500] lr: 0.000036 closs: 0.7008 (0.7532) grad_norm: 0.3791 (0.4306) time: 5.5797 data: 0.0001 max mem: 71357 -[14:48:02.479681] Epoch: [2] [820/6500] lr: 0.000036 closs: 0.7034 (0.7527) grad_norm: 0.3791 (0.4314) time: 5.5779 data: 0.0002 max mem: 71357 -[14:48:58.271678] Epoch: [2] [830/6500] lr: 0.000036 closs: 0.6858 (0.7525) grad_norm: 0.4105 (0.4314) time: 5.5766 data: 0.0002 max mem: 71357 -[14:49:54.210878] Epoch: [2] [840/6500] lr: 0.000036 closs: 0.7823 (0.7531) grad_norm: 0.4233 (0.4315) time: 5.5865 data: 0.0001 max mem: 71357 -[14:50:49.920972] Epoch: [2] [850/6500] lr: 0.000036 closs: 0.7854 (0.7535) grad_norm: 0.4315 (0.4319) time: 5.5824 data: 0.0001 max mem: 71357 -[14:51:45.745245] Epoch: [2] [860/6500] lr: 0.000036 closs: 0.7603 (0.7538) grad_norm: 0.4249 (0.4585) time: 5.5766 data: 0.0002 max mem: 71357 -[14:52:41.602665] Epoch: [2] [870/6500] lr: 0.000036 closs: 0.7513 (0.7540) grad_norm: 0.4205 (0.4580) time: 5.5840 data: 0.0002 max mem: 71357 -[14:53:37.485162] Epoch: [2] [880/6500] lr: 0.000036 closs: 0.7341 (0.7539) grad_norm: 0.3926 (0.4577) time: 5.5869 data: 0.0002 max mem: 71357 -[14:54:33.281177] Epoch: [2] [890/6500] lr: 0.000036 closs: 0.7098 (0.7535) grad_norm: 0.3598 (0.4576) time: 5.5838 data: 0.0002 max mem: 71357 -[14:55:29.092517] Epoch: [2] [900/6500] lr: 0.000036 closs: 0.7047 (0.7527) grad_norm: 0.3727 (0.4576) time: 5.5802 data: 0.0002 max mem: 71357 -[14:56:24.907964] Epoch: [2] [910/6500] lr: 0.000036 closs: 0.7047 (0.7532) grad_norm: 0.3870 (0.4592) time: 5.5812 data: 0.0002 max mem: 71357 -[14:57:20.783829] Epoch: [2] [920/6500] lr: 0.000036 closs: 0.7544 (0.7532) grad_norm: 0.3975 (0.4588) time: 5.5844 data: 0.0002 max mem: 71357 -[14:58:16.764824] Epoch: [2] [930/6500] lr: 0.000036 closs: 0.7854 (0.7545) grad_norm: 0.3975 (0.4591) time: 5.5927 data: 0.0002 max mem: 71357 -[14:59:12.669502] Epoch: [2] [940/6500] lr: 0.000036 closs: 0.8281 (0.7545) grad_norm: 0.3922 (0.4587) time: 5.5942 data: 0.0002 max mem: 71357 -[15:00:08.554641] Epoch: [2] [950/6500] lr: 0.000036 closs: 0.7504 (0.7542) grad_norm: 0.3922 (0.4584) time: 5.5894 data: 0.0002 max mem: 71357 -[15:01:04.431999] Epoch: [2] [960/6500] lr: 0.000036 closs: 0.7345 (0.7542) grad_norm: 0.3973 (0.4582) time: 5.5880 data: 0.0002 max mem: 71357 -[15:02:00.303487] Epoch: [2] [970/6500] lr: 0.000036 closs: 0.6935 (0.7539) grad_norm: 0.4130 (0.4579) time: 5.5873 data: 0.0002 max mem: 71357 -[15:02:56.106192] Epoch: [2] [980/6500] lr: 0.000036 closs: 0.6935 (0.7538) grad_norm: 0.3919 (0.4570) time: 5.5836 data: 0.0002 max mem: 71357 -[15:03:51.815680] Epoch: [2] [990/6500] lr: 0.000036 closs: 0.6868 (0.7532) grad_norm: 0.3815 (0.4564) time: 5.5755 data: 0.0002 max mem: 71357 -[15:04:47.621183] Epoch: [2] [1000/6500] lr: 0.000035 closs: 0.6118 (0.7523) grad_norm: 0.3674 (0.4557) time: 5.5756 data: 0.0002 max mem: 71357 -[15:05:43.550443] Epoch: [2] [1010/6500] lr: 0.000035 closs: 0.6951 (0.7526) grad_norm: 0.3764 (0.4553) time: 5.5866 data: 0.0002 max mem: 71357 -[15:06:39.399591] Epoch: [2] [1020/6500] lr: 0.000035 closs: 0.7234 (0.7525) grad_norm: 0.3825 (0.4549) time: 5.5888 data: 0.0002 max mem: 71357 -[15:07:35.244881] Epoch: [2] [1030/6500] lr: 0.000035 closs: 0.7264 (0.7526) grad_norm: 0.4198 (0.4550) time: 5.5846 data: 0.0002 max mem: 71357 -[15:08:31.032761] Epoch: [2] [1040/6500] lr: 0.000035 closs: 0.7264 (0.7522) grad_norm: 0.4198 (0.4551) time: 5.5815 data: 0.0002 max mem: 71357 -[15:09:26.908118] Epoch: [2] [1050/6500] lr: 0.000035 closs: 0.7246 (0.7525) grad_norm: 0.4107 (0.4542) time: 5.5830 data: 0.0002 max mem: 71357 -[15:10:22.920376] Epoch: [2] [1060/6500] lr: 0.000035 closs: 0.7357 (0.7525) grad_norm: 0.4051 (0.4537) time: 5.5943 data: 0.0002 max mem: 71357 -[15:11:18.721035] Epoch: [2] [1070/6500] lr: 0.000035 closs: 0.7357 (0.7528) grad_norm: 0.3653 (0.4529) time: 5.5905 data: 0.0002 max mem: 71357 -[15:12:14.492900] Epoch: [2] [1080/6500] lr: 0.000035 closs: 0.7322 (0.7529) grad_norm: 0.3654 (0.4585) time: 5.5785 data: 0.0002 max mem: 71357 -[15:13:10.357333] Epoch: [2] [1090/6500] lr: 0.000035 closs: 0.7251 (0.7528) grad_norm: 0.4259 (0.4585) time: 5.5817 data: 0.0002 max mem: 71357 -[15:14:06.180945] Epoch: [2] [1100/6500] lr: 0.000035 closs: 0.7086 (0.7525) grad_norm: 0.4282 (0.4583) time: 5.5843 data: 0.0002 max mem: 71357 -[15:15:01.964084] Epoch: [2] [1110/6500] lr: 0.000035 closs: 0.6976 (0.7523) grad_norm: 0.4664 (0.4576) time: 5.5802 data: 0.0002 max mem: 71357 -[15:15:57.685490] Epoch: [2] [1120/6500] lr: 0.000035 closs: 0.7384 (0.7523) grad_norm: 0.3756 (0.4574) time: 5.5751 data: 0.0002 max mem: 71357 -[15:16:53.523206] Epoch: [2] [1130/6500] lr: 0.000035 closs: 0.7384 (0.7522) grad_norm: 0.3756 (0.4573) time: 5.5779 data: 0.0002 max mem: 71357 -[15:17:49.360003] Epoch: [2] [1140/6500] lr: 0.000035 closs: 0.7554 (0.7525) grad_norm: 0.3694 (0.4568) time: 5.5836 data: 0.0002 max mem: 71357 -[15:18:45.316724] Epoch: [2] [1150/6500] lr: 0.000035 closs: 0.7964 (0.7526) grad_norm: 0.3718 (0.4563) time: 5.5895 data: 0.0002 max mem: 71357 -[15:19:41.125463] Epoch: [2] [1160/6500] lr: 0.000035 closs: 0.7983 (0.7529) grad_norm: 0.3694 (0.4561) time: 5.5881 data: 0.0002 max mem: 71357 -[15:20:36.889226] Epoch: [2] [1170/6500] lr: 0.000035 closs: 0.7632 (0.7523) grad_norm: 0.3718 (0.4555) time: 5.5785 data: 0.0002 max mem: 71357 -[15:21:32.660197] Epoch: [2] [1180/6500] lr: 0.000035 closs: 0.6888 (0.7523) grad_norm: 0.4002 (0.4550) time: 5.5766 data: 0.0002 max mem: 71357 -[15:22:28.516278] Epoch: [2] [1190/6500] lr: 0.000035 closs: 0.6888 (0.7523) grad_norm: 0.4274 (0.4553) time: 5.5813 data: 0.0002 max mem: 71357 -[15:23:24.346365] Epoch: [2] [1200/6500] lr: 0.000035 closs: 0.7446 (0.7525) grad_norm: 0.4099 (0.4549) time: 5.5842 data: 0.0002 max mem: 71357 -[15:24:20.134239] Epoch: [2] [1210/6500] lr: 0.000035 closs: 0.6904 (0.7520) grad_norm: 0.4104 (0.4544) time: 5.5807 data: 0.0002 max mem: 71357 -[15:25:15.925899] Epoch: [2] [1220/6500] lr: 0.000035 closs: 0.7181 (0.7522) grad_norm: 0.4152 (0.4552) time: 5.5788 data: 0.0002 max mem: 71357 -[15:26:11.888462] Epoch: [2] [1230/6500] lr: 0.000035 closs: 0.7267 (0.7519) grad_norm: 0.4083 (0.4548) time: 5.5876 data: 0.0002 max mem: 71357 -[15:27:07.646299] Epoch: [2] [1240/6500] lr: 0.000035 closs: 0.6987 (0.7515) grad_norm: 0.4104 (0.4549) time: 5.5859 data: 0.0002 max mem: 71357 -[15:28:03.389698] Epoch: [2] [1250/6500] lr: 0.000035 closs: 0.6987 (0.7520) grad_norm: 0.4244 (0.4545) time: 5.5750 data: 0.0002 max mem: 71357 -[15:28:59.282048] Epoch: [2] [1260/6500] lr: 0.000035 closs: 0.6953 (0.7516) grad_norm: 0.4244 (0.4546) time: 5.5816 data: 0.0002 max mem: 71357 -[15:29:55.061908] Epoch: [2] [1270/6500] lr: 0.000035 closs: 0.7070 (0.7511) grad_norm: 0.4485 (0.4550) time: 5.5834 data: 0.0002 max mem: 71357 -[15:30:50.944627] Epoch: [2] [1280/6500] lr: 0.000035 closs: 0.7370 (0.7518) grad_norm: 0.4485 (0.4555) time: 5.5830 data: 0.0002 max mem: 71357 -[15:31:46.666142] Epoch: [2] [1290/6500] lr: 0.000034 closs: 0.7087 (0.7517) grad_norm: 0.4504 (0.4558) time: 5.5801 data: 0.0001 max mem: 71357 -[15:32:42.433467] Epoch: [2] [1300/6500] lr: 0.000034 closs: 0.7037 (0.7513) grad_norm: 0.4323 (0.4552) time: 5.5744 data: 0.0001 max mem: 71357 -[15:33:38.181099] Epoch: [2] [1310/6500] lr: 0.000034 closs: 0.7091 (0.7517) grad_norm: 0.4323 (0.4556) time: 5.5757 data: 0.0002 max mem: 71357 -[15:34:33.994700] Epoch: [2] [1320/6500] lr: 0.000034 closs: 0.7841 (0.7518) grad_norm: 0.4367 (0.4562) time: 5.5780 data: 0.0002 max mem: 71357 -[15:35:29.692112] Epoch: [2] [1330/6500] lr: 0.000034 closs: 0.7856 (0.7518) grad_norm: 0.3952 (0.4559) time: 5.5754 data: 0.0002 max mem: 71357 -[15:36:25.439166] Epoch: [2] [1340/6500] lr: 0.000034 closs: 0.7897 (0.7518) grad_norm: 0.4837 (0.4569) time: 5.5721 data: 0.0001 max mem: 71357 -[15:37:21.352355] Epoch: [2] [1350/6500] lr: 0.000034 closs: 0.7514 (0.7517) grad_norm: 0.4169 (0.4564) time: 5.5829 data: 0.0001 max mem: 71357 -[15:38:17.111671] Epoch: [2] [1360/6500] lr: 0.000034 closs: 0.7093 (0.7515) grad_norm: 0.3850 (0.4564) time: 5.5835 data: 0.0002 max mem: 71357 -[15:39:12.952198] Epoch: [2] [1370/6500] lr: 0.000034 closs: 0.7113 (0.7511) grad_norm: 0.4112 (0.4626) time: 5.5799 data: 0.0002 max mem: 71357 -[15:40:08.662786] Epoch: [2] [1380/6500] lr: 0.000034 closs: 0.7377 (0.7512) grad_norm: 0.3917 (0.4633) time: 5.5775 data: 0.0001 max mem: 71357 -[15:41:04.467847] Epoch: [2] [1390/6500] lr: 0.000034 closs: 0.7426 (0.7512) grad_norm: 0.4009 (0.4635) time: 5.5756 data: 0.0002 max mem: 71357 -[15:42:00.167536] Epoch: [2] [1400/6500] lr: 0.000034 closs: 0.7377 (0.7511) grad_norm: 0.4512 (0.4636) time: 5.5751 data: 0.0002 max mem: 71357 -[15:42:56.023589] Epoch: [2] [1410/6500] lr: 0.000034 closs: 0.7182 (0.7510) grad_norm: 0.4512 (0.4638) time: 5.5777 data: 0.0002 max mem: 71357 -[15:43:51.710783] Epoch: [2] [1420/6500] lr: 0.000034 closs: 0.7622 (0.7516) grad_norm: 0.4390 (0.4632) time: 5.5771 data: 0.0002 max mem: 71357 -[15:44:47.432279] Epoch: [2] [1430/6500] lr: 0.000034 closs: 0.8510 (0.7520) grad_norm: 0.4332 (0.4633) time: 5.5703 data: 0.0002 max mem: 71357 -[15:45:43.110068] Epoch: [2] [1440/6500] lr: 0.000034 closs: 0.8121 (0.7523) grad_norm: 0.4311 (0.4631) time: 5.5698 data: 0.0002 max mem: 71357 -[15:46:39.076188] Epoch: [2] [1450/6500] lr: 0.000034 closs: 0.7827 (0.7524) grad_norm: 0.3811 (0.4628) time: 5.5821 data: 0.0002 max mem: 71357 -[15:47:34.786235] Epoch: [2] [1460/6500] lr: 0.000034 closs: 0.7050 (0.7523) grad_norm: 0.3816 (0.4624) time: 5.5837 data: 0.0002 max mem: 71357 -[15:48:30.542459] Epoch: [2] [1470/6500] lr: 0.000034 closs: 0.7427 (0.7526) grad_norm: 0.3891 (0.4620) time: 5.5732 data: 0.0001 max mem: 71357 -[15:49:26.323343] Epoch: [2] [1480/6500] lr: 0.000034 closs: 0.7493 (0.7524) grad_norm: 0.3891 (0.4615) time: 5.5767 data: 0.0001 max mem: 71357 -[15:50:22.063916] Epoch: [2] [1490/6500] lr: 0.000034 closs: 0.7493 (0.7529) grad_norm: 0.4173 (0.4618) time: 5.5759 data: 0.0001 max mem: 71357 -[15:51:17.879657] Epoch: [2] [1500/6500] lr: 0.000034 closs: 0.7679 (0.7528) grad_norm: 0.4173 (0.4612) time: 5.5777 data: 0.0001 max mem: 71357 -[15:52:13.600143] Epoch: [2] [1510/6500] lr: 0.000034 closs: 0.7527 (0.7528) grad_norm: 0.4107 (0.4608) time: 5.5767 data: 0.0001 max mem: 71357 -[15:53:09.349932] Epoch: [2] [1520/6500] lr: 0.000034 closs: 0.7025 (0.7528) grad_norm: 0.4178 (0.4605) time: 5.5734 data: 0.0002 max mem: 71357 -[15:54:05.153207] Epoch: [2] [1530/6500] lr: 0.000034 closs: 0.6863 (0.7526) grad_norm: 0.3977 (0.4603) time: 5.5776 data: 0.0002 max mem: 71357 -[15:55:01.034381] Epoch: [2] [1540/6500] lr: 0.000034 closs: 0.6656 (0.7521) grad_norm: 0.3977 (0.4598) time: 5.5841 data: 0.0002 max mem: 71357 -[15:55:56.795048] Epoch: [2] [1550/6500] lr: 0.000034 closs: 0.6656 (0.7522) grad_norm: 0.3977 (0.4595) time: 5.5820 data: 0.0002 max mem: 71357 -[15:56:52.611293] Epoch: [2] [1560/6500] lr: 0.000034 closs: 0.7849 (0.7525) grad_norm: 0.3977 (0.4592) time: 5.5787 data: 0.0001 max mem: 71357 -[15:57:48.402966] Epoch: [2] [1570/6500] lr: 0.000034 closs: 0.7785 (0.7525) grad_norm: 0.4062 (0.4590) time: 5.5803 data: 0.0001 max mem: 71357 -[15:58:44.217639] Epoch: [2] [1580/6500] lr: 0.000033 closs: 0.7870 (0.7531) grad_norm: 0.4106 (0.4587) time: 5.5802 data: 0.0001 max mem: 71357 -[15:59:40.002405] Epoch: [2] [1590/6500] lr: 0.000033 closs: 0.7723 (0.7532) grad_norm: 0.4159 (0.4585) time: 5.5799 data: 0.0001 max mem: 71357 -[16:00:35.721907] Epoch: [2] [1600/6500] lr: 0.000033 closs: 0.6743 (0.7524) grad_norm: 0.4302 (0.4586) time: 5.5751 data: 0.0001 max mem: 71357 -[16:01:31.458032] Epoch: [2] [1610/6500] lr: 0.000033 closs: 0.6607 (0.7525) grad_norm: 0.4363 (0.4585) time: 5.5727 data: 0.0001 max mem: 71357 -[16:02:27.297980] Epoch: [2] [1620/6500] lr: 0.000033 closs: 0.7730 (0.7526) grad_norm: 0.4302 (0.4584) time: 5.5787 data: 0.0001 max mem: 71357 -[16:03:23.124468] Epoch: [2] [1630/6500] lr: 0.000033 closs: 0.7734 (0.7526) grad_norm: 0.4404 (0.4611) time: 5.5832 data: 0.0001 max mem: 71357 -[16:04:18.899383] Epoch: [2] [1640/6500] lr: 0.000033 closs: 0.7204 (0.7525) grad_norm: 0.4069 (0.4606) time: 5.5800 data: 0.0001 max mem: 71357 -[16:05:14.642459] Epoch: [2] [1650/6500] lr: 0.000033 closs: 0.7107 (0.7522) grad_norm: 0.3977 (0.4603) time: 5.5758 data: 0.0001 max mem: 71357 -[16:06:10.346118] Epoch: [2] [1660/6500] lr: 0.000033 closs: 0.6985 (0.7520) grad_norm: 0.3828 (0.4600) time: 5.5722 data: 0.0001 max mem: 71357 -[16:07:06.260484] Epoch: [2] [1670/6500] lr: 0.000033 closs: 0.7301 (0.7522) grad_norm: 0.3594 (0.4594) time: 5.5808 data: 0.0001 max mem: 71357 -[16:08:01.979004] Epoch: [2] [1680/6500] lr: 0.000033 closs: 0.7341 (0.7519) grad_norm: 0.3670 (0.4596) time: 5.5816 data: 0.0001 max mem: 71357 -[16:08:57.731380] Epoch: [2] [1690/6500] lr: 0.000033 closs: 0.7382 (0.7520) grad_norm: 0.3670 (0.4593) time: 5.5735 data: 0.0001 max mem: 71357 -[16:09:53.536263] Epoch: [2] [1700/6500] lr: 0.000033 closs: 0.7382 (0.7517) grad_norm: 0.3817 (0.4593) time: 5.5778 data: 0.0001 max mem: 71357 -[16:10:49.268849] Epoch: [2] [1710/6500] lr: 0.000033 closs: 0.7031 (0.7516) grad_norm: 0.4069 (0.4592) time: 5.5768 data: 0.0001 max mem: 71357 -[16:11:45.090787] Epoch: [2] [1720/6500] lr: 0.000033 closs: 0.7533 (0.7522) grad_norm: 0.3817 (0.4587) time: 5.5776 data: 0.0001 max mem: 71357 -[16:12:40.803323] Epoch: [2] [1730/6500] lr: 0.000033 closs: 0.7173 (0.7521) grad_norm: 0.4069 (0.4589) time: 5.5766 data: 0.0001 max mem: 71357 -[16:13:36.587002] Epoch: [2] [1740/6500] lr: 0.000033 closs: 0.6879 (0.7516) grad_norm: 0.3828 (0.4585) time: 5.5747 data: 0.0002 max mem: 71357 -[16:14:32.418319] Epoch: [2] [1750/6500] lr: 0.000033 closs: 0.6997 (0.7518) grad_norm: 0.3686 (0.4582) time: 5.5807 data: 0.0002 max mem: 71357 -[16:15:28.290997] Epoch: [2] [1760/6500] lr: 0.000033 closs: 0.7878 (0.7520) grad_norm: 0.3828 (0.4587) time: 5.5851 data: 0.0001 max mem: 71357 -[16:16:24.081054] Epoch: [2] [1770/6500] lr: 0.000033 closs: 0.8214 (0.7525) grad_norm: 0.3686 (0.4582) time: 5.5830 data: 0.0001 max mem: 71357 -[16:17:19.820691] Epoch: [2] [1780/6500] lr: 0.000033 closs: 0.7482 (0.7530) grad_norm: 0.3871 (0.4579) time: 5.5764 data: 0.0001 max mem: 71357 -[16:18:15.530633] Epoch: [2] [1790/6500] lr: 0.000033 closs: 0.7315 (0.7531) grad_norm: 0.4017 (0.4577) time: 5.5724 data: 0.0002 max mem: 71357 -[16:19:11.342479] Epoch: [2] [1800/6500] lr: 0.000033 closs: 0.7292 (0.7527) grad_norm: 0.4017 (0.4573) time: 5.5760 data: 0.0002 max mem: 71357 -[16:20:07.123117] Epoch: [2] [1810/6500] lr: 0.000033 closs: 0.7292 (0.7532) grad_norm: 0.4125 (0.4572) time: 5.5796 data: 0.0001 max mem: 71357 -[16:21:02.878153] Epoch: [2] [1820/6500] lr: 0.000033 closs: 0.8183 (0.7531) grad_norm: 0.4125 (0.4570) time: 5.5767 data: 0.0001 max mem: 71357 -[16:21:58.572732] Epoch: [2] [1830/6500] lr: 0.000033 closs: 0.7561 (0.7535) grad_norm: 0.4260 (0.4570) time: 5.5724 data: 0.0001 max mem: 71357 -[16:22:54.333135] Epoch: [2] [1840/6500] lr: 0.000033 closs: 0.7238 (0.7532) grad_norm: 0.4551 (0.4570) time: 5.5726 data: 0.0001 max mem: 71357 -[16:23:50.182133] Epoch: [2] [1850/6500] lr: 0.000033 closs: 0.7363 (0.7533) grad_norm: 0.4112 (0.4567) time: 5.5804 data: 0.0001 max mem: 71357 -[16:24:45.956038] Epoch: [2] [1860/6500] lr: 0.000032 closs: 0.7864 (0.7535) grad_norm: 0.4024 (0.4560) time: 5.5811 data: 0.0001 max mem: 71357 -[16:25:41.681321] Epoch: [2] [1870/6500] lr: 0.000032 closs: 0.7445 (0.7531) grad_norm: 0.3907 (0.4558) time: 5.5749 data: 0.0001 max mem: 71357 -[16:26:37.476320] Epoch: [2] [1880/6500] lr: 0.000032 closs: 0.7359 (0.7534) grad_norm: 0.3884 (0.4627) time: 5.5759 data: 0.0001 max mem: 71357 -[16:27:33.312302] Epoch: [2] [1890/6500] lr: 0.000032 closs: 0.6661 (0.7528) grad_norm: 0.3852 (0.4630) time: 5.5814 data: 0.0001 max mem: 71357 -[16:28:29.226442] Epoch: [2] [1900/6500] lr: 0.000032 closs: 0.7272 (0.7528) grad_norm: 0.3907 (0.4624) time: 5.5874 data: 0.0002 max mem: 71357 -[16:29:24.933778] Epoch: [2] [1910/6500] lr: 0.000032 closs: 0.7816 (0.7531) grad_norm: 0.3694 (0.4621) time: 5.5810 data: 0.0001 max mem: 71357 -[16:30:20.642646] Epoch: [2] [1920/6500] lr: 0.000032 closs: 0.7881 (0.7530) grad_norm: 0.3694 (0.4622) time: 5.5707 data: 0.0001 max mem: 71357 -[16:31:16.469179] Epoch: [2] [1930/6500] lr: 0.000032 closs: 0.7151 (0.7529) grad_norm: 0.3642 (0.4620) time: 5.5767 data: 0.0001 max mem: 71357 -[16:32:12.233790] Epoch: [2] [1940/6500] lr: 0.000032 closs: 0.7240 (0.7527) grad_norm: 0.3498 (0.4614) time: 5.5794 data: 0.0001 max mem: 71357 -[16:33:07.938858] Epoch: [2] [1950/6500] lr: 0.000032 closs: 0.7272 (0.7528) grad_norm: 0.3489 (0.4612) time: 5.5733 data: 0.0002 max mem: 71357 -[16:34:03.698037] Epoch: [2] [1960/6500] lr: 0.000032 closs: 0.7917 (0.7535) grad_norm: 0.3489 (0.4617) time: 5.5731 data: 0.0002 max mem: 71357 -[16:34:59.459082] Epoch: [2] [1970/6500] lr: 0.000032 closs: 0.7833 (0.7534) grad_norm: 0.3513 (0.4616) time: 5.5759 data: 0.0001 max mem: 71357 -[16:35:55.325722] Epoch: [2] [1980/6500] lr: 0.000032 closs: 0.7259 (0.7530) grad_norm: 0.3869 (0.4612) time: 5.5813 data: 0.0001 max mem: 71357 -[16:36:51.047078] Epoch: [2] [1990/6500] lr: 0.000032 closs: 0.7144 (0.7527) grad_norm: 0.3811 (0.4608) time: 5.5793 data: 0.0001 max mem: 71357 -[16:37:46.785724] Epoch: [2] [2000/6500] lr: 0.000032 closs: 0.7638 (0.7531) grad_norm: 0.3811 (0.4604) time: 5.5729 data: 0.0001 max mem: 71357 -[16:38:42.646345] Epoch: [2] [2010/6500] lr: 0.000032 closs: 0.8048 (0.7533) grad_norm: 0.3741 (0.4601) time: 5.5798 data: 0.0002 max mem: 71357 -[16:39:38.375405] Epoch: [2] [2020/6500] lr: 0.000032 closs: 0.7819 (0.7533) grad_norm: 0.3729 (0.4600) time: 5.5793 data: 0.0002 max mem: 71357 -[16:40:34.271946] Epoch: [2] [2030/6500] lr: 0.000032 closs: 0.7338 (0.7533) grad_norm: 0.3709 (0.4597) time: 5.5812 data: 0.0001 max mem: 71357 -[16:41:29.947289] Epoch: [2] [2040/6500] lr: 0.000032 closs: 0.7511 (0.7535) grad_norm: 0.3709 (0.4593) time: 5.5785 data: 0.0001 max mem: 71357 -[16:42:25.641146] Epoch: [2] [2050/6500] lr: 0.000032 closs: 0.7587 (0.7534) grad_norm: 0.3732 (0.4592) time: 5.5683 data: 0.0001 max mem: 71357 -[16:43:21.404041] Epoch: [2] [2060/6500] lr: 0.000032 closs: 0.7396 (0.7533) grad_norm: 0.4092 (0.4594) time: 5.5727 data: 0.0002 max mem: 71357 -[16:44:17.307464] Epoch: [2] [2070/6500] lr: 0.000032 closs: 0.7244 (0.7532) grad_norm: 0.4281 (0.4594) time: 5.5832 data: 0.0002 max mem: 71357 -[16:45:13.098288] Epoch: [2] [2080/6500] lr: 0.000032 closs: 0.7375 (0.7533) grad_norm: 0.4281 (0.4592) time: 5.5846 data: 0.0001 max mem: 71357 -[16:46:08.858975] Epoch: [2] [2090/6500] lr: 0.000032 closs: 0.7613 (0.7531) grad_norm: 0.4259 (0.4589) time: 5.5774 data: 0.0001 max mem: 71357 -[16:47:04.577290] Epoch: [2] [2100/6500] lr: 0.000032 closs: 0.7005 (0.7532) grad_norm: 0.4049 (0.4588) time: 5.5738 data: 0.0002 max mem: 71357 -[16:48:00.432916] Epoch: [2] [2110/6500] lr: 0.000032 closs: 0.8067 (0.7535) grad_norm: 0.4049 (0.4590) time: 5.5786 data: 0.0002 max mem: 71357 -[16:48:56.200449] Epoch: [2] [2120/6500] lr: 0.000032 closs: 0.7533 (0.7535) grad_norm: 0.3871 (0.4590) time: 5.5810 data: 0.0001 max mem: 71357 -[16:49:51.959520] Epoch: [2] [2130/6500] lr: 0.000032 closs: 0.7356 (0.7535) grad_norm: 0.4249 (0.4589) time: 5.5762 data: 0.0001 max mem: 71357 -[16:50:47.683973] Epoch: [2] [2140/6500] lr: 0.000032 closs: 0.7356 (0.7534) grad_norm: 0.4513 (0.4592) time: 5.5741 data: 0.0001 max mem: 71357 -[16:51:43.510769] Epoch: [2] [2150/6500] lr: 0.000031 closs: 0.7314 (0.7532) grad_norm: 0.4494 (0.4594) time: 5.5775 data: 0.0002 max mem: 71357 -[16:52:39.483174] Epoch: [2] [2160/6500] lr: 0.000031 closs: 0.7314 (0.7532) grad_norm: 0.4494 (0.4591) time: 5.5898 data: 0.0002 max mem: 71357 -[16:53:35.190269] Epoch: [2] [2170/6500] lr: 0.000031 closs: 0.7144 (0.7529) grad_norm: 0.4850 (0.4600) time: 5.5839 data: 0.0002 max mem: 71357 -[16:54:30.969262] Epoch: [2] [2180/6500] lr: 0.000031 closs: 0.6895 (0.7527) grad_norm: 0.4358 (0.4601) time: 5.5742 data: 0.0002 max mem: 71357 -[16:55:26.737443] Epoch: [2] [2190/6500] lr: 0.000031 closs: 0.8195 (0.7531) grad_norm: 0.4794 (0.4601) time: 5.5773 data: 0.0001 max mem: 71357 -[16:56:22.693351] Epoch: [2] [2200/6500] lr: 0.000031 closs: 0.8195 (0.7531) grad_norm: 0.4243 (0.4596) time: 5.5861 data: 0.0001 max mem: 71357 -[16:57:18.527056] Epoch: [2] [2210/6500] lr: 0.000031 closs: 0.7547 (0.7532) grad_norm: 0.3739 (0.4592) time: 5.5894 data: 0.0001 max mem: 71357 -[16:58:14.196064] Epoch: [2] [2220/6500] lr: 0.000031 closs: 0.8415 (0.7538) grad_norm: 0.3739 (0.4589) time: 5.5750 data: 0.0001 max mem: 71357 -[16:59:09.869300] Epoch: [2] [2230/6500] lr: 0.000031 closs: 0.8448 (0.7541) grad_norm: 0.3739 (0.4590) time: 5.5670 data: 0.0001 max mem: 71357 -[17:00:05.674412] Epoch: [2] [2240/6500] lr: 0.000031 closs: 0.6850 (0.7535) grad_norm: 0.4223 (0.4592) time: 5.5738 data: 0.0001 max mem: 71357 -[17:01:01.574772] Epoch: [2] [2250/6500] lr: 0.000031 closs: 0.6532 (0.7533) grad_norm: 0.4223 (0.4587) time: 5.5852 data: 0.0001 max mem: 71357 -[17:01:57.342997] Epoch: [2] [2260/6500] lr: 0.000031 closs: 0.7523 (0.7535) grad_norm: 0.4039 (0.4583) time: 5.5834 data: 0.0001 max mem: 71357 -[17:02:53.157781] Epoch: [2] [2270/6500] lr: 0.000031 closs: 0.7387 (0.7535) grad_norm: 0.3563 (0.4579) time: 5.5791 data: 0.0001 max mem: 71357 -[17:03:48.964752] Epoch: [2] [2280/6500] lr: 0.000031 closs: 0.7139 (0.7531) grad_norm: 0.3563 (0.4580) time: 5.5810 data: 0.0002 max mem: 71357 -[17:04:44.707350] Epoch: [2] [2290/6500] lr: 0.000031 closs: 0.7008 (0.7529) grad_norm: 0.3878 (0.4583) time: 5.5774 data: 0.0002 max mem: 71357 -[17:05:40.431226] Epoch: [2] [2300/6500] lr: 0.000031 closs: 0.7468 (0.7534) grad_norm: 0.4123 (0.4584) time: 5.5732 data: 0.0001 max mem: 71357 -[17:06:36.202782] Epoch: [2] [2310/6500] lr: 0.000031 closs: 0.7475 (0.7532) grad_norm: 0.4419 (0.4583) time: 5.5747 data: 0.0001 max mem: 71357 -[17:07:31.949877] Epoch: [2] [2320/6500] lr: 0.000031 closs: 0.7278 (0.7532) grad_norm: 0.4416 (0.4583) time: 5.5759 data: 0.0001 max mem: 71357 -[17:08:27.811744] Epoch: [2] [2330/6500] lr: 0.000031 closs: 0.7511 (0.7532) grad_norm: 0.4123 (0.4579) time: 5.5804 data: 0.0002 max mem: 71357 -[17:09:23.480954] Epoch: [2] [2340/6500] lr: 0.000031 closs: 0.6838 (0.7529) grad_norm: 0.3687 (0.4580) time: 5.5764 data: 0.0002 max mem: 71357 -[17:10:19.161065] Epoch: [2] [2350/6500] lr: 0.000031 closs: 0.6708 (0.7526) grad_norm: 0.4002 (0.4580) time: 5.5673 data: 0.0001 max mem: 71357 -[17:11:15.085404] Epoch: [2] [2360/6500] lr: 0.000031 closs: 0.7101 (0.7525) grad_norm: 0.3574 (0.4576) time: 5.5801 data: 0.0001 max mem: 71357 -[17:12:10.933953] Epoch: [2] [2370/6500] lr: 0.000031 closs: 0.6778 (0.7522) grad_norm: 0.3789 (0.4578) time: 5.5886 data: 0.0001 max mem: 71357 -[17:13:06.743314] Epoch: [2] [2380/6500] lr: 0.000031 closs: 0.6736 (0.7522) grad_norm: 0.3789 (0.4583) time: 5.5828 data: 0.0002 max mem: 71357 -[17:14:02.607812] Epoch: [2] [2390/6500] lr: 0.000031 closs: 0.7213 (0.7523) grad_norm: 0.3567 (0.4582) time: 5.5836 data: 0.0002 max mem: 71357 -[17:14:58.400102] Epoch: [2] [2400/6500] lr: 0.000031 closs: 0.7723 (0.7525) grad_norm: 0.4134 (0.4579) time: 5.5827 data: 0.0002 max mem: 71357 -[17:15:54.228655] Epoch: [2] [2410/6500] lr: 0.000031 closs: 0.7646 (0.7522) grad_norm: 0.4042 (0.4577) time: 5.5809 data: 0.0002 max mem: 71357 -[17:16:50.097920] Epoch: [2] [2420/6500] lr: 0.000030 closs: 0.7181 (0.7523) grad_norm: 0.4134 (0.4580) time: 5.5848 data: 0.0002 max mem: 71357 -[17:17:45.947083] Epoch: [2] [2430/6500] lr: 0.000030 closs: 0.7181 (0.7522) grad_norm: 0.4134 (0.4578) time: 5.5858 data: 0.0002 max mem: 71357 -[17:18:41.722068] Epoch: [2] [2440/6500] lr: 0.000030 closs: 0.6543 (0.7520) grad_norm: 0.3858 (0.4578) time: 5.5811 data: 0.0002 max mem: 71357 -[17:19:37.404521] Epoch: [2] [2450/6500] lr: 0.000030 closs: 0.7162 (0.7520) grad_norm: 0.4227 (0.4576) time: 5.5728 data: 0.0001 max mem: 71357 -[17:20:33.253044] Epoch: [2] [2460/6500] lr: 0.000030 closs: 0.7169 (0.7519) grad_norm: 0.3806 (0.4578) time: 5.5764 data: 0.0001 max mem: 71357 -[17:21:29.076746] Epoch: [2] [2470/6500] lr: 0.000030 closs: 0.7169 (0.7521) grad_norm: 0.3771 (0.4575) time: 5.5835 data: 0.0001 max mem: 71357 -[17:22:24.780924] Epoch: [2] [2480/6500] lr: 0.000030 closs: 0.7095 (0.7519) grad_norm: 0.3961 (0.4574) time: 5.5763 data: 0.0001 max mem: 71357 -[17:23:20.609490] Epoch: [2] [2490/6500] lr: 0.000030 closs: 0.6672 (0.7515) grad_norm: 0.3961 (0.4572) time: 5.5765 data: 0.0002 max mem: 71357 -[17:24:16.427929] Epoch: [2] [2500/6500] lr: 0.000030 closs: 0.6893 (0.7514) grad_norm: 0.4139 (0.4583) time: 5.5823 data: 0.0002 max mem: 71357 -[17:25:12.307447] Epoch: [2] [2510/6500] lr: 0.000030 closs: 0.7710 (0.7517) grad_norm: 0.4430 (0.4583) time: 5.5848 data: 0.0002 max mem: 71357 -[17:26:08.087180] Epoch: [2] [2520/6500] lr: 0.000030 closs: 0.7870 (0.7520) grad_norm: 0.4261 (0.4582) time: 5.5828 data: 0.0002 max mem: 71357 -[17:27:03.896747] Epoch: [2] [2530/6500] lr: 0.000030 closs: 0.7789 (0.7522) grad_norm: 0.4251 (0.4578) time: 5.5793 data: 0.0002 max mem: 71357 -[17:27:59.671442] Epoch: [2] [2540/6500] lr: 0.000030 closs: 0.7912 (0.7527) grad_norm: 0.4251 (0.4584) time: 5.5791 data: 0.0001 max mem: 71357 -[17:28:55.523823] Epoch: [2] [2550/6500] lr: 0.000030 closs: 0.7942 (0.7527) grad_norm: 0.4170 (0.4583) time: 5.5813 data: 0.0001 max mem: 71357 -[17:29:51.350447] Epoch: [2] [2560/6500] lr: 0.000030 closs: 0.7354 (0.7527) grad_norm: 0.4065 (0.4579) time: 5.5839 data: 0.0001 max mem: 71357 -[17:30:47.189924] Epoch: [2] [2570/6500] lr: 0.000030 closs: 0.7708 (0.7528) grad_norm: 0.4130 (0.4578) time: 5.5832 data: 0.0001 max mem: 71357 -[17:31:42.914037] Epoch: [2] [2580/6500] lr: 0.000030 closs: 0.7749 (0.7529) grad_norm: 0.4057 (0.4577) time: 5.5781 data: 0.0001 max mem: 71357 -[17:32:38.776742] Epoch: [2] [2590/6500] lr: 0.000030 closs: 0.7551 (0.7528) grad_norm: 0.3975 (0.4582) time: 5.5792 data: 0.0001 max mem: 71357 -[17:33:34.669070] Epoch: [2] [2600/6500] lr: 0.000030 closs: 0.7472 (0.7528) grad_norm: 0.3958 (0.4579) time: 5.5876 data: 0.0002 max mem: 71357 -[17:34:30.494585] Epoch: [2] [2610/6500] lr: 0.000030 closs: 0.7444 (0.7528) grad_norm: 0.3791 (0.4578) time: 5.5858 data: 0.0002 max mem: 71357 -[17:35:26.217992] Epoch: [2] [2620/6500] lr: 0.000030 closs: 0.7376 (0.7526) grad_norm: 0.4022 (0.4580) time: 5.5774 data: 0.0002 max mem: 71357 -[17:36:21.989461] Epoch: [2] [2630/6500] lr: 0.000030 closs: 0.6735 (0.7525) grad_norm: 0.3811 (0.4577) time: 5.5746 data: 0.0002 max mem: 71357 -[17:37:17.875799] Epoch: [2] [2640/6500] lr: 0.000030 closs: 0.7215 (0.7526) grad_norm: 0.4032 (0.4577) time: 5.5827 data: 0.0002 max mem: 71357 -[17:38:13.618548] Epoch: [2] [2650/6500] lr: 0.000030 closs: 0.7928 (0.7529) grad_norm: 0.4032 (0.4575) time: 5.5813 data: 0.0002 max mem: 71357 -[17:39:09.409027] Epoch: [2] [2660/6500] lr: 0.000030 closs: 0.7808 (0.7528) grad_norm: 0.3757 (0.4571) time: 5.5765 data: 0.0002 max mem: 71357 -[17:40:05.267929] Epoch: [2] [2670/6500] lr: 0.000030 closs: 0.6744 (0.7524) grad_norm: 0.3697 (0.4571) time: 5.5823 data: 0.0002 max mem: 71357 -[17:41:01.187074] Epoch: [2] [2680/6500] lr: 0.000030 closs: 0.6744 (0.7522) grad_norm: 0.3697 (0.4574) time: 5.5888 data: 0.0002 max mem: 71357 -[17:41:57.168637] Epoch: [2] [2690/6500] lr: 0.000030 closs: 0.7051 (0.7521) grad_norm: 0.3697 (0.4577) time: 5.5949 data: 0.0002 max mem: 71357 -[17:42:53.038755] Epoch: [2] [2700/6500] lr: 0.000029 closs: 0.6676 (0.7519) grad_norm: 0.3712 (0.4573) time: 5.5924 data: 0.0002 max mem: 71357 -[17:43:48.838773] Epoch: [2] [2710/6500] lr: 0.000029 closs: 0.6929 (0.7518) grad_norm: 0.3499 (0.4569) time: 5.5834 data: 0.0002 max mem: 71357 -[17:44:44.674479] Epoch: [2] [2720/6500] lr: 0.000029 closs: 0.7400 (0.7517) grad_norm: 0.3712 (0.4572) time: 5.5816 data: 0.0002 max mem: 71357 -[17:45:40.590527] Epoch: [2] [2730/6500] lr: 0.000029 closs: 0.7642 (0.7517) grad_norm: 0.3458 (0.4568) time: 5.5874 data: 0.0002 max mem: 71357 -[17:46:36.441104] Epoch: [2] [2740/6500] lr: 0.000029 closs: 0.7642 (0.7517) grad_norm: 0.3688 (0.4567) time: 5.5882 data: 0.0002 max mem: 71357 -[17:47:32.267248] Epoch: [2] [2750/6500] lr: 0.000029 closs: 0.7494 (0.7519) grad_norm: 0.3896 (0.4564) time: 5.5837 data: 0.0002 max mem: 71357 -[17:48:28.031604] Epoch: [2] [2760/6500] lr: 0.000029 closs: 0.7593 (0.7519) grad_norm: 0.3661 (0.4562) time: 5.5793 data: 0.0002 max mem: 71357 -[17:49:24.001258] Epoch: [2] [2770/6500] lr: 0.000029 closs: 0.7320 (0.7518) grad_norm: 0.3661 (0.4561) time: 5.5866 data: 0.0002 max mem: 71357 -[17:50:19.737391] Epoch: [2] [2780/6500] lr: 0.000029 closs: 0.7929 (0.7520) grad_norm: 0.3975 (0.4562) time: 5.5852 data: 0.0002 max mem: 71357 -[17:51:15.562224] Epoch: [2] [2790/6500] lr: 0.000029 closs: 0.7360 (0.7517) grad_norm: 0.4177 (0.4565) time: 5.5779 data: 0.0002 max mem: 71357 -[17:52:11.360869] Epoch: [2] [2800/6500] lr: 0.000029 closs: 0.7360 (0.7518) grad_norm: 0.4483 (0.4565) time: 5.5811 data: 0.0002 max mem: 71357 -[17:53:07.203956] Epoch: [2] [2810/6500] lr: 0.000029 closs: 0.7754 (0.7520) grad_norm: 0.4330 (0.4563) time: 5.5820 data: 0.0002 max mem: 71357 -[17:54:03.137654] Epoch: [2] [2820/6500] lr: 0.000029 closs: 0.7180 (0.7520) grad_norm: 0.4330 (0.4559) time: 5.5887 data: 0.0002 max mem: 71357 -[17:54:58.848050] Epoch: [2] [2830/6500] lr: 0.000029 closs: 0.7873 (0.7525) grad_norm: 0.3781 (0.4559) time: 5.5821 data: 0.0002 max mem: 71357 -[17:55:54.655523] Epoch: [2] [2840/6500] lr: 0.000029 closs: 0.7809 (0.7522) grad_norm: 0.3781 (0.4558) time: 5.5757 data: 0.0002 max mem: 71357 -[17:56:50.438313] Epoch: [2] [2850/6500] lr: 0.000029 closs: 0.7091 (0.7522) grad_norm: 0.4116 (0.4562) time: 5.5794 data: 0.0002 max mem: 71357 -[17:57:46.308543] Epoch: [2] [2860/6500] lr: 0.000029 closs: 0.7401 (0.7521) grad_norm: 0.4116 (0.4559) time: 5.5825 data: 0.0002 max mem: 71357 -[17:58:42.069918] Epoch: [2] [2870/6500] lr: 0.000029 closs: 0.7651 (0.7522) grad_norm: 0.4116 (0.4559) time: 5.5814 data: 0.0002 max mem: 71357 -[17:59:37.920237] Epoch: [2] [2880/6500] lr: 0.000029 closs: 0.6953 (0.7519) grad_norm: 0.3913 (0.4556) time: 5.5805 data: 0.0002 max mem: 71357 -[18:00:33.763842] Epoch: [2] [2890/6500] lr: 0.000029 closs: 0.6868 (0.7522) grad_norm: 0.3913 (0.4557) time: 5.5846 data: 0.0002 max mem: 71357 -[18:01:29.628889] Epoch: [2] [2900/6500] lr: 0.000029 closs: 0.6978 (0.7521) grad_norm: 0.3972 (0.4556) time: 5.5853 data: 0.0002 max mem: 71357 -[18:02:25.459492] Epoch: [2] [2910/6500] lr: 0.000029 closs: 0.7243 (0.7522) grad_norm: 0.3972 (0.4556) time: 5.5846 data: 0.0002 max mem: 71357 -[18:03:21.321438] Epoch: [2] [2920/6500] lr: 0.000029 closs: 0.8076 (0.7526) grad_norm: 0.3893 (0.4553) time: 5.5845 data: 0.0002 max mem: 71357 -[18:04:17.173694] Epoch: [2] [2930/6500] lr: 0.000029 closs: 0.8087 (0.7527) grad_norm: 0.3737 (0.4552) time: 5.5855 data: 0.0002 max mem: 71357 -[18:05:13.069197] Epoch: [2] [2940/6500] lr: 0.000029 closs: 0.7949 (0.7529) grad_norm: 0.3737 (0.4549) time: 5.5872 data: 0.0002 max mem: 71357 -[18:06:08.996669] Epoch: [2] [2950/6500] lr: 0.000029 closs: 0.7837 (0.7529) grad_norm: 0.3866 (0.4548) time: 5.5910 data: 0.0002 max mem: 71357 -[18:07:04.827409] Epoch: [2] [2960/6500] lr: 0.000029 closs: 0.7392 (0.7529) grad_norm: 0.4097 (0.4546) time: 5.5878 data: 0.0002 max mem: 71357 -[18:08:00.615811] Epoch: [2] [2970/6500] lr: 0.000029 closs: 0.6962 (0.7528) grad_norm: 0.3931 (0.4544) time: 5.5808 data: 0.0002 max mem: 71357 -[18:08:56.463465] Epoch: [2] [2980/6500] lr: 0.000028 closs: 0.6962 (0.7526) grad_norm: 0.3931 (0.4542) time: 5.5816 data: 0.0002 max mem: 71357 -[18:09:52.439453] Epoch: [2] [2990/6500] lr: 0.000028 closs: 0.6416 (0.7524) grad_norm: 0.3694 (0.4540) time: 5.5910 data: 0.0002 max mem: 71357 -[18:10:48.280738] Epoch: [2] [3000/6500] lr: 0.000028 closs: 0.7245 (0.7525) grad_norm: 0.3694 (0.4538) time: 5.5907 data: 0.0002 max mem: 71357 -[18:11:44.003007] Epoch: [2] [3010/6500] lr: 0.000028 closs: 0.7370 (0.7526) grad_norm: 0.3666 (0.4538) time: 5.5781 data: 0.0002 max mem: 71357 -[18:12:39.782636] Epoch: [2] [3020/6500] lr: 0.000028 closs: 0.7203 (0.7527) grad_norm: 0.3794 (0.4540) time: 5.5750 data: 0.0002 max mem: 71357 -[18:13:35.609566] Epoch: [2] [3030/6500] lr: 0.000028 closs: 0.7023 (0.7525) grad_norm: 0.4202 (0.4539) time: 5.5802 data: 0.0002 max mem: 71357 -[18:14:31.428499] Epoch: [2] [3040/6500] lr: 0.000028 closs: 0.7417 (0.7528) grad_norm: 0.3936 (0.4536) time: 5.5822 data: 0.0002 max mem: 71357 -[18:15:27.180218] Epoch: [2] [3050/6500] lr: 0.000028 closs: 0.7516 (0.7528) grad_norm: 0.3936 (0.4538) time: 5.5784 data: 0.0001 max mem: 71357 -[18:16:22.953546] Epoch: [2] [3060/6500] lr: 0.000028 closs: 0.7541 (0.7529) grad_norm: 0.4185 (0.4543) time: 5.5762 data: 0.0001 max mem: 71357 -[18:17:18.679769] Epoch: [2] [3070/6500] lr: 0.000028 closs: 0.7600 (0.7529) grad_norm: 0.4185 (0.4543) time: 5.5749 data: 0.0001 max mem: 71357 -[18:18:14.439626] Epoch: [2] [3080/6500] lr: 0.000028 closs: 0.7600 (0.7530) grad_norm: 0.4330 (0.4541) time: 5.5742 data: 0.0001 max mem: 71357 -[18:19:10.196084] Epoch: [2] [3090/6500] lr: 0.000028 closs: 0.7707 (0.7531) grad_norm: 0.4097 (0.4539) time: 5.5757 data: 0.0002 max mem: 71357 -[18:20:05.958990] Epoch: [2] [3100/6500] lr: 0.000028 closs: 0.7064 (0.7532) grad_norm: 0.4097 (0.4542) time: 5.5759 data: 0.0002 max mem: 71357 -[18:21:01.656755] Epoch: [2] [3110/6500] lr: 0.000028 closs: 0.7059 (0.7531) grad_norm: 0.4056 (0.4545) time: 5.5729 data: 0.0001 max mem: 71357 -[18:21:57.405447] Epoch: [2] [3120/6500] lr: 0.000028 closs: 0.7170 (0.7530) grad_norm: 0.3960 (0.4554) time: 5.5723 data: 0.0001 max mem: 71357 -[18:22:53.313307] Epoch: [2] [3130/6500] lr: 0.000028 closs: 0.7557 (0.7532) grad_norm: 0.3887 (0.4553) time: 5.5828 data: 0.0001 max mem: 71357 -[18:23:49.017260] Epoch: [2] [3140/6500] lr: 0.000028 closs: 0.8077 (0.7531) grad_norm: 0.3960 (0.4552) time: 5.5805 data: 0.0001 max mem: 71357 -[18:24:44.802215] Epoch: [2] [3150/6500] lr: 0.000028 closs: 0.7324 (0.7530) grad_norm: 0.3656 (0.4549) time: 5.5743 data: 0.0001 max mem: 71357 -[18:25:40.636892] Epoch: [2] [3160/6500] lr: 0.000028 closs: 0.6688 (0.7530) grad_norm: 0.4030 (0.4549) time: 5.5809 data: 0.0001 max mem: 71357 -[18:26:36.458261] Epoch: [2] [3170/6500] lr: 0.000028 closs: 0.7235 (0.7530) grad_norm: 0.4011 (0.4547) time: 5.5827 data: 0.0001 max mem: 71357 -[18:27:32.179589] Epoch: [2] [3180/6500] lr: 0.000028 closs: 0.7677 (0.7531) grad_norm: 0.3832 (0.4547) time: 5.5770 data: 0.0001 max mem: 71357 -[18:28:27.829819] Epoch: [2] [3190/6500] lr: 0.000028 closs: 0.7677 (0.7532) grad_norm: 0.4011 (0.4550) time: 5.5685 data: 0.0001 max mem: 71357 -[18:29:23.602288] Epoch: [2] [3200/6500] lr: 0.000028 closs: 0.6795 (0.7532) grad_norm: 0.4011 (0.4551) time: 5.5710 data: 0.0001 max mem: 71357 -[18:30:19.499138] Epoch: [2] [3210/6500] lr: 0.000028 closs: 0.6620 (0.7529) grad_norm: 0.4624 (0.4554) time: 5.5833 data: 0.0001 max mem: 71357 -[18:31:15.203601] Epoch: [2] [3220/6500] lr: 0.000028 closs: 0.7157 (0.7530) grad_norm: 0.4624 (0.4555) time: 5.5800 data: 0.0001 max mem: 71357 -[18:32:11.005898] Epoch: [2] [3230/6500] lr: 0.000028 closs: 0.8028 (0.7529) grad_norm: 0.3974 (0.4553) time: 5.5752 data: 0.0001 max mem: 71357 -[18:33:06.750451] Epoch: [2] [3240/6500] lr: 0.000028 closs: 0.7577 (0.7528) grad_norm: 0.3974 (0.4551) time: 5.5773 data: 0.0001 max mem: 71357 -[18:34:02.445923] Epoch: [2] [3250/6500] lr: 0.000028 closs: 0.6945 (0.7527) grad_norm: 0.3940 (0.4554) time: 5.5719 data: 0.0001 max mem: 71357 -[18:34:58.253353] Epoch: [2] [3260/6500] lr: 0.000027 closs: 0.6054 (0.7522) grad_norm: 0.4058 (0.4556) time: 5.5750 data: 0.0001 max mem: 71357 -[18:35:54.028810] Epoch: [2] [3270/6500] lr: 0.000027 closs: 0.6357 (0.7522) grad_norm: 0.4058 (0.4553) time: 5.5790 data: 0.0001 max mem: 71357 -[18:36:49.797666] Epoch: [2] [3280/6500] lr: 0.000027 closs: 0.7882 (0.7521) grad_norm: 0.4058 (0.4555) time: 5.5771 data: 0.0001 max mem: 71357 -[18:37:45.573523] Epoch: [2] [3290/6500] lr: 0.000027 closs: 0.7882 (0.7521) grad_norm: 0.3778 (0.4556) time: 5.5771 data: 0.0001 max mem: 71357 -[18:38:41.467647] Epoch: [2] [3300/6500] lr: 0.000027 closs: 0.7377 (0.7520) grad_norm: 0.3660 (0.4553) time: 5.5834 data: 0.0001 max mem: 71357 -[18:39:37.255941] Epoch: [2] [3310/6500] lr: 0.000027 closs: 0.6829 (0.7519) grad_norm: 0.3693 (0.4554) time: 5.5840 data: 0.0001 max mem: 71357 -[18:40:33.026676] Epoch: [2] [3320/6500] lr: 0.000027 closs: 0.7501 (0.7518) grad_norm: 0.3773 (0.4556) time: 5.5778 data: 0.0001 max mem: 71357 -[18:41:28.790780] Epoch: [2] [3330/6500] lr: 0.000027 closs: 0.7565 (0.7519) grad_norm: 0.3885 (0.4554) time: 5.5766 data: 0.0001 max mem: 71357 -[18:42:24.526741] Epoch: [2] [3340/6500] lr: 0.000027 closs: 0.7717 (0.7522) grad_norm: 0.4094 (0.4555) time: 5.5749 data: 0.0001 max mem: 71357 -[18:43:20.332717] Epoch: [2] [3350/6500] lr: 0.000027 closs: 0.7165 (0.7520) grad_norm: 0.4280 (0.4556) time: 5.5770 data: 0.0001 max mem: 71357 -[18:44:16.086465] Epoch: [2] [3360/6500] lr: 0.000027 closs: 0.6851 (0.7518) grad_norm: 0.4280 (0.4554) time: 5.5779 data: 0.0001 max mem: 71357 -[18:45:11.852035] Epoch: [2] [3370/6500] lr: 0.000027 closs: 0.6957 (0.7519) grad_norm: 0.4283 (0.4553) time: 5.5759 data: 0.0001 max mem: 71357 -[18:46:07.516148] Epoch: [2] [3380/6500] lr: 0.000027 closs: 0.7751 (0.7520) grad_norm: 0.4299 (0.4558) time: 5.5714 data: 0.0001 max mem: 71357 -[18:47:03.306747] Epoch: [2] [3390/6500] lr: 0.000027 closs: 0.7488 (0.7519) grad_norm: 0.4299 (0.4556) time: 5.5727 data: 0.0001 max mem: 71357 -[18:47:59.053434] Epoch: [2] [3400/6500] lr: 0.000027 closs: 0.7401 (0.7521) grad_norm: 0.4102 (0.4552) time: 5.5768 data: 0.0001 max mem: 71357 -[18:48:54.740746] Epoch: [2] [3410/6500] lr: 0.000027 closs: 0.7448 (0.7521) grad_norm: 0.4299 (0.4553) time: 5.5716 data: 0.0001 max mem: 71357 -[18:49:50.560422] Epoch: [2] [3420/6500] lr: 0.000027 closs: 0.7453 (0.7521) grad_norm: 0.3657 (0.4550) time: 5.5753 data: 0.0001 max mem: 71357 -[18:50:46.435930] Epoch: [2] [3430/6500] lr: 0.000027 closs: 0.7453 (0.7521) grad_norm: 0.3689 (0.4550) time: 5.5847 data: 0.0001 max mem: 71357 -[18:51:42.267551] Epoch: [2] [3440/6500] lr: 0.000027 closs: 0.7437 (0.7523) grad_norm: 0.4137 (0.4552) time: 5.5852 data: 0.0001 max mem: 71357 -[18:52:38.039434] Epoch: [2] [3450/6500] lr: 0.000027 closs: 0.7048 (0.7521) grad_norm: 0.4042 (0.4552) time: 5.5801 data: 0.0001 max mem: 71357 -[18:53:33.817847] Epoch: [2] [3460/6500] lr: 0.000027 closs: 0.7392 (0.7522) grad_norm: 0.4137 (0.4551) time: 5.5774 data: 0.0001 max mem: 71357 -[18:54:29.618361] Epoch: [2] [3470/6500] lr: 0.000027 closs: 0.7650 (0.7522) grad_norm: 0.4506 (0.4550) time: 5.5788 data: 0.0001 max mem: 71357 -[18:55:25.443455] Epoch: [2] [3480/6500] lr: 0.000027 closs: 0.7204 (0.7520) grad_norm: 0.3884 (0.4548) time: 5.5812 data: 0.0001 max mem: 71357 -[18:56:21.227026] Epoch: [2] [3490/6500] lr: 0.000027 closs: 0.7204 (0.7519) grad_norm: 0.3710 (0.4546) time: 5.5803 data: 0.0001 max mem: 71357 -[18:57:16.999656] Epoch: [2] [3500/6500] lr: 0.000027 closs: 0.7337 (0.7519) grad_norm: 0.3694 (0.4546) time: 5.5777 data: 0.0001 max mem: 71357 -[18:58:12.821955] Epoch: [2] [3510/6500] lr: 0.000027 closs: 0.7021 (0.7521) grad_norm: 0.3694 (0.4544) time: 5.5797 data: 0.0001 max mem: 71357 -[18:59:08.653591] Epoch: [2] [3520/6500] lr: 0.000027 closs: 0.7021 (0.7520) grad_norm: 0.3993 (0.4544) time: 5.5826 data: 0.0001 max mem: 71357 -[19:00:04.373591] Epoch: [2] [3530/6500] lr: 0.000026 closs: 0.6680 (0.7517) grad_norm: 0.4037 (0.4543) time: 5.5775 data: 0.0001 max mem: 71357 -[19:01:00.095231] Epoch: [2] [3540/6500] lr: 0.000026 closs: 0.6592 (0.7515) grad_norm: 0.3815 (0.4543) time: 5.5720 data: 0.0001 max mem: 71357 -[19:01:55.783676] Epoch: [2] [3550/6500] lr: 0.000026 closs: 0.7683 (0.7518) grad_norm: 0.3815 (0.4542) time: 5.5704 data: 0.0001 max mem: 71357 -[19:02:51.613530] Epoch: [2] [3560/6500] lr: 0.000026 closs: 0.7802 (0.7518) grad_norm: 0.3848 (0.4543) time: 5.5758 data: 0.0001 max mem: 71357 -[19:03:47.362602] Epoch: [2] [3570/6500] lr: 0.000026 closs: 0.7255 (0.7519) grad_norm: 0.4178 (0.4544) time: 5.5789 data: 0.0001 max mem: 71357 -[19:04:43.065929] Epoch: [2] [3580/6500] lr: 0.000026 closs: 0.7179 (0.7519) grad_norm: 0.4329 (0.4545) time: 5.5726 data: 0.0001 max mem: 71357 -[19:05:38.869463] Epoch: [2] [3590/6500] lr: 0.000026 closs: 0.7179 (0.7518) grad_norm: 0.4329 (0.4543) time: 5.5752 data: 0.0001 max mem: 71357 -[19:06:34.613911] Epoch: [2] [3600/6500] lr: 0.000026 closs: 0.7426 (0.7519) grad_norm: 0.4353 (0.4545) time: 5.5773 data: 0.0001 max mem: 71357 -[19:07:30.425204] Epoch: [2] [3610/6500] lr: 0.000026 closs: 0.7318 (0.7520) grad_norm: 0.4175 (0.4543) time: 5.5777 data: 0.0001 max mem: 71357 -[19:08:26.219365] Epoch: [2] [3620/6500] lr: 0.000026 closs: 0.7082 (0.7519) grad_norm: 0.4031 (0.4542) time: 5.5802 data: 0.0001 max mem: 71357 -[19:09:21.945457] Epoch: [2] [3630/6500] lr: 0.000026 closs: 0.7118 (0.7519) grad_norm: 0.4031 (0.4574) time: 5.5759 data: 0.0001 max mem: 71357 -[19:10:17.768386] Epoch: [2] [3640/6500] lr: 0.000026 closs: 0.7003 (0.7517) grad_norm: 0.4097 (0.4575) time: 5.5774 data: 0.0001 max mem: 71357 -[19:11:13.634197] Epoch: [2] [3650/6500] lr: 0.000026 closs: 0.6395 (0.7513) grad_norm: 0.4272 (0.4582) time: 5.5843 data: 0.0001 max mem: 71357 -[19:12:09.273361] Epoch: [2] [3660/6500] lr: 0.000026 closs: 0.5753 (0.7509) grad_norm: 0.4639 (0.4585) time: 5.5751 data: 0.0001 max mem: 71357 -[19:13:05.018929] Epoch: [2] [3670/6500] lr: 0.000026 closs: 0.7002 (0.7509) grad_norm: 0.4504 (0.4583) time: 5.5691 data: 0.0001 max mem: 71357 -[19:14:00.765347] Epoch: [2] [3680/6500] lr: 0.000026 closs: 0.7512 (0.7510) grad_norm: 0.3919 (0.4580) time: 5.5745 data: 0.0001 max mem: 71357 -[19:14:56.592826] Epoch: [2] [3690/6500] lr: 0.000026 closs: 0.7369 (0.7511) grad_norm: 0.4039 (0.4580) time: 5.5786 data: 0.0001 max mem: 71357 -[19:15:52.353357] Epoch: [2] [3700/6500] lr: 0.000026 closs: 0.7269 (0.7511) grad_norm: 0.4066 (0.4582) time: 5.5793 data: 0.0001 max mem: 71357 -[19:16:48.075930] Epoch: [2] [3710/6500] lr: 0.000026 closs: 0.7604 (0.7514) grad_norm: 0.4211 (0.4584) time: 5.5741 data: 0.0001 max mem: 71357 -[19:17:43.807793] Epoch: [2] [3720/6500] lr: 0.000026 closs: 0.7592 (0.7515) grad_norm: 0.4212 (0.4582) time: 5.5726 data: 0.0001 max mem: 71357 -[19:18:39.589533] Epoch: [2] [3730/6500] lr: 0.000026 closs: 0.7153 (0.7513) grad_norm: 0.4276 (0.4583) time: 5.5756 data: 0.0001 max mem: 71357 -[19:19:35.517625] Epoch: [2] [3740/6500] lr: 0.000026 closs: 0.6791 (0.7514) grad_norm: 0.4190 (0.4583) time: 5.5854 data: 0.0001 max mem: 71357 -[19:20:31.333293] Epoch: [2] [3750/6500] lr: 0.000026 closs: 0.7668 (0.7515) grad_norm: 0.3620 (0.4579) time: 5.5871 data: 0.0001 max mem: 71357 -[19:21:27.133335] Epoch: [2] [3760/6500] lr: 0.000026 closs: 0.7307 (0.7514) grad_norm: 0.3646 (0.4580) time: 5.5807 data: 0.0001 max mem: 71357 -[19:22:22.832460] Epoch: [2] [3770/6500] lr: 0.000026 closs: 0.6995 (0.7513) grad_norm: 0.3620 (0.4578) time: 5.5749 data: 0.0001 max mem: 71357 -[19:23:18.596759] Epoch: [2] [3780/6500] lr: 0.000026 closs: 0.6823 (0.7511) grad_norm: 0.3688 (0.4578) time: 5.5731 data: 0.0001 max mem: 71357 -[19:24:14.511863] Epoch: [2] [3790/6500] lr: 0.000026 closs: 0.6904 (0.7510) grad_norm: 0.4127 (0.4578) time: 5.5838 data: 0.0002 max mem: 71357 -[19:25:10.290788] Epoch: [2] [3800/6500] lr: 0.000026 closs: 0.7732 (0.7511) grad_norm: 0.3918 (0.4576) time: 5.5846 data: 0.0002 max mem: 71357 -[19:26:06.038060] Epoch: [2] [3810/6500] lr: 0.000025 closs: 0.8008 (0.7514) grad_norm: 0.4030 (0.4576) time: 5.5762 data: 0.0001 max mem: 71357 -[19:27:01.862060] Epoch: [2] [3820/6500] lr: 0.000025 closs: 0.7921 (0.7515) grad_norm: 0.4030 (0.4575) time: 5.5785 data: 0.0001 max mem: 71357 -[19:27:57.761778] Epoch: [2] [3830/6500] lr: 0.000025 closs: 0.7190 (0.7514) grad_norm: 0.4133 (0.4574) time: 5.5861 data: 0.0001 max mem: 71357 -[19:28:53.629998] Epoch: [2] [3840/6500] lr: 0.000025 closs: 0.7989 (0.7516) grad_norm: 0.4133 (0.4576) time: 5.5883 data: 0.0001 max mem: 71357 -[19:29:49.375750] Epoch: [2] [3850/6500] lr: 0.000025 closs: 0.7985 (0.7515) grad_norm: 0.4121 (0.4574) time: 5.5806 data: 0.0001 max mem: 71357 -[19:30:45.134749] Epoch: [2] [3860/6500] lr: 0.000025 closs: 0.7189 (0.7515) grad_norm: 0.4118 (0.4572) time: 5.5751 data: 0.0001 max mem: 71357 -[19:31:41.021523] Epoch: [2] [3870/6500] lr: 0.000025 closs: 0.7189 (0.7515) grad_norm: 0.3507 (0.4573) time: 5.5822 data: 0.0001 max mem: 71357 -[19:32:36.805285] Epoch: [2] [3880/6500] lr: 0.000025 closs: 0.7183 (0.7515) grad_norm: 0.3621 (0.4572) time: 5.5835 data: 0.0001 max mem: 71357 -[19:33:32.498955] Epoch: [2] [3890/6500] lr: 0.000025 closs: 0.7183 (0.7517) grad_norm: 0.4034 (0.4573) time: 5.5738 data: 0.0001 max mem: 71357 -[19:34:28.271499] Epoch: [2] [3900/6500] lr: 0.000025 closs: 0.7171 (0.7516) grad_norm: 0.4034 (0.4572) time: 5.5732 data: 0.0001 max mem: 71357 -[19:35:24.068345] Epoch: [2] [3910/6500] lr: 0.000025 closs: 0.7171 (0.7516) grad_norm: 0.4155 (0.4572) time: 5.5784 data: 0.0001 max mem: 71357 -[19:36:19.846821] Epoch: [2] [3920/6500] lr: 0.000025 closs: 0.7395 (0.7517) grad_norm: 0.4235 (0.4572) time: 5.5787 data: 0.0001 max mem: 71357 -[19:37:15.586916] Epoch: [2] [3930/6500] lr: 0.000025 closs: 0.7115 (0.7516) grad_norm: 0.4147 (0.4570) time: 5.5758 data: 0.0001 max mem: 71357 -[19:38:11.257908] Epoch: [2] [3940/6500] lr: 0.000025 closs: 0.7486 (0.7519) grad_norm: 0.4114 (0.4569) time: 5.5704 data: 0.0001 max mem: 71357 -[19:39:07.044937] Epoch: [2] [3950/6500] lr: 0.000025 closs: 0.7479 (0.7518) grad_norm: 0.3677 (0.4569) time: 5.5728 data: 0.0002 max mem: 71357 -[19:40:02.943403] Epoch: [2] [3960/6500] lr: 0.000025 closs: 0.7034 (0.7517) grad_norm: 0.3595 (0.4567) time: 5.5842 data: 0.0002 max mem: 71357 -[19:40:58.634133] Epoch: [2] [3970/6500] lr: 0.000025 closs: 0.6918 (0.7517) grad_norm: 0.3632 (0.4566) time: 5.5794 data: 0.0001 max mem: 71357 -[19:41:54.409209] Epoch: [2] [3980/6500] lr: 0.000025 closs: 0.6974 (0.7517) grad_norm: 0.3576 (0.4563) time: 5.5732 data: 0.0001 max mem: 71357 -[19:42:50.131763] Epoch: [2] [3990/6500] lr: 0.000025 closs: 0.7662 (0.7519) grad_norm: 0.3603 (0.4562) time: 5.5748 data: 0.0001 max mem: 71357 -[19:43:45.912721] Epoch: [2] [4000/6500] lr: 0.000025 closs: 0.7659 (0.7518) grad_norm: 0.3586 (0.4560) time: 5.5751 data: 0.0001 max mem: 71357 -[19:44:41.737617] Epoch: [2] [4010/6500] lr: 0.000025 closs: 0.6833 (0.7515) grad_norm: 0.3591 (0.4560) time: 5.5802 data: 0.0001 max mem: 71357 -[19:45:37.533836] Epoch: [2] [4020/6500] lr: 0.000025 closs: 0.6835 (0.7515) grad_norm: 0.3779 (0.4559) time: 5.5810 data: 0.0001 max mem: 71357 -[19:46:33.370425] Epoch: [2] [4030/6500] lr: 0.000025 closs: 0.7941 (0.7518) grad_norm: 0.3591 (0.4558) time: 5.5816 data: 0.0001 max mem: 71357 -[19:47:29.110704] Epoch: [2] [4040/6500] lr: 0.000025 closs: 0.8463 (0.7520) grad_norm: 0.4270 (0.4560) time: 5.5788 data: 0.0001 max mem: 71357 -[19:48:24.981624] Epoch: [2] [4050/6500] lr: 0.000025 closs: 0.7672 (0.7521) grad_norm: 0.4270 (0.4562) time: 5.5805 data: 0.0001 max mem: 71357 -[19:49:20.791100] Epoch: [2] [4060/6500] lr: 0.000025 closs: 0.7486 (0.7521) grad_norm: 0.4354 (0.4561) time: 5.5839 data: 0.0001 max mem: 71357 -[19:50:16.553534] Epoch: [2] [4070/6500] lr: 0.000025 closs: 0.7918 (0.7525) grad_norm: 0.4397 (0.4561) time: 5.5785 data: 0.0001 max mem: 71357 -[19:51:12.298445] Epoch: [2] [4080/6500] lr: 0.000025 closs: 0.8232 (0.7524) grad_norm: 0.4222 (0.4559) time: 5.5753 data: 0.0001 max mem: 71357 -[19:52:08.214197] Epoch: [2] [4090/6500] lr: 0.000024 closs: 0.6875 (0.7523) grad_norm: 0.3943 (0.4558) time: 5.5829 data: 0.0001 max mem: 71357 -[19:53:03.923183] Epoch: [2] [4100/6500] lr: 0.000024 closs: 0.6875 (0.7524) grad_norm: 0.3943 (0.4558) time: 5.5811 data: 0.0001 max mem: 71357 -[19:53:59.656143] Epoch: [2] [4110/6500] lr: 0.000024 closs: 0.7027 (0.7524) grad_norm: 0.3542 (0.4555) time: 5.5720 data: 0.0001 max mem: 71357 -[19:54:55.392527] Epoch: [2] [4120/6500] lr: 0.000024 closs: 0.7027 (0.7524) grad_norm: 0.3617 (0.4554) time: 5.5734 data: 0.0001 max mem: 71357 -[19:55:51.121119] Epoch: [2] [4130/6500] lr: 0.000024 closs: 0.7443 (0.7525) grad_norm: 0.4015 (0.4557) time: 5.5731 data: 0.0001 max mem: 71357 -[19:56:47.017662] Epoch: [2] [4140/6500] lr: 0.000024 closs: 0.6968 (0.7523) grad_norm: 0.4069 (0.4558) time: 5.5812 data: 0.0001 max mem: 71357 -[19:57:42.833295] Epoch: [2] [4150/6500] lr: 0.000024 closs: 0.6998 (0.7525) grad_norm: 0.4234 (0.4557) time: 5.5855 data: 0.0001 max mem: 71357 -[19:58:38.652202] Epoch: [2] [4160/6500] lr: 0.000024 closs: 0.7783 (0.7526) grad_norm: 0.4234 (0.4561) time: 5.5816 data: 0.0001 max mem: 71357 -[19:59:34.468532] Epoch: [2] [4170/6500] lr: 0.000024 closs: 0.7338 (0.7524) grad_norm: 0.4316 (0.4561) time: 5.5816 data: 0.0002 max mem: 71357 -[20:00:30.388864] Epoch: [2] [4180/6500] lr: 0.000024 closs: 0.6694 (0.7522) grad_norm: 0.3939 (0.4559) time: 5.5866 data: 0.0002 max mem: 71357 -[20:01:26.113858] Epoch: [2] [4190/6500] lr: 0.000024 closs: 0.6837 (0.7522) grad_norm: 0.3998 (0.4559) time: 5.5820 data: 0.0001 max mem: 71357 -[20:02:21.829528] Epoch: [2] [4200/6500] lr: 0.000024 closs: 0.6585 (0.7517) grad_norm: 0.3751 (0.4557) time: 5.5720 data: 0.0001 max mem: 71357 -[20:03:17.604842] Epoch: [2] [4210/6500] lr: 0.000024 closs: 0.6701 (0.7517) grad_norm: 0.3748 (0.4598) time: 5.5745 data: 0.0001 max mem: 71357 -[20:04:13.477797] Epoch: [2] [4220/6500] lr: 0.000024 closs: 0.7236 (0.7518) grad_norm: 0.3748 (0.4596) time: 5.5823 data: 0.0001 max mem: 71357 -[20:05:09.384343] Epoch: [2] [4230/6500] lr: 0.000024 closs: 0.7549 (0.7519) grad_norm: 0.3470 (0.4595) time: 5.5889 data: 0.0001 max mem: 71357 -[20:06:05.120629] Epoch: [2] [4240/6500] lr: 0.000024 closs: 0.8058 (0.7521) grad_norm: 0.3470 (0.4594) time: 5.5820 data: 0.0001 max mem: 71357 -[20:07:00.817213] Epoch: [2] [4250/6500] lr: 0.000024 closs: 0.7658 (0.7519) grad_norm: 0.3538 (0.4594) time: 5.5716 data: 0.0001 max mem: 71357 -[20:07:56.580927] Epoch: [2] [4260/6500] lr: 0.000024 closs: 0.6630 (0.7518) grad_norm: 0.3951 (0.4593) time: 5.5729 data: 0.0001 max mem: 71357 -[20:08:52.369538] Epoch: [2] [4270/6500] lr: 0.000024 closs: 0.6919 (0.7519) grad_norm: 0.3951 (0.4592) time: 5.5775 data: 0.0001 max mem: 71357 -[20:09:48.116804] Epoch: [2] [4280/6500] lr: 0.000024 closs: 0.7422 (0.7519) grad_norm: 0.3951 (0.4593) time: 5.5767 data: 0.0001 max mem: 71357 -[20:10:43.991384] Epoch: [2] [4290/6500] lr: 0.000024 closs: 0.7422 (0.7519) grad_norm: 0.3758 (0.4593) time: 5.5810 data: 0.0001 max mem: 71357 -[20:11:39.673183] Epoch: [2] [4300/6500] lr: 0.000024 closs: 0.6998 (0.7519) grad_norm: 0.3586 (0.4592) time: 5.5777 data: 0.0001 max mem: 71357 -[20:12:35.691999] Epoch: [2] [4310/6500] lr: 0.000024 closs: 0.6928 (0.7518) grad_norm: 0.3542 (0.4589) time: 5.5849 data: 0.0001 max mem: 71357 -[20:13:31.414107] Epoch: [2] [4320/6500] lr: 0.000024 closs: 0.7853 (0.7521) grad_norm: 0.3542 (0.4591) time: 5.5870 data: 0.0001 max mem: 71357 -[20:14:27.153355] Epoch: [2] [4330/6500] lr: 0.000024 closs: 0.7853 (0.7521) grad_norm: 0.3800 (0.4590) time: 5.5730 data: 0.0001 max mem: 71357 -[20:15:22.884207] Epoch: [2] [4340/6500] lr: 0.000024 closs: 0.7557 (0.7522) grad_norm: 0.3681 (0.4590) time: 5.5734 data: 0.0001 max mem: 71357 -[20:16:18.717580] Epoch: [2] [4350/6500] lr: 0.000024 closs: 0.7819 (0.7523) grad_norm: 0.3800 (0.4589) time: 5.5781 data: 0.0001 max mem: 71357 -[20:17:14.490059] Epoch: [2] [4360/6500] lr: 0.000023 closs: 0.7472 (0.7523) grad_norm: 0.4156 (0.4590) time: 5.5802 data: 0.0001 max mem: 71357 -[20:18:10.275532] Epoch: [2] [4370/6500] lr: 0.000023 closs: 0.7444 (0.7523) grad_norm: 0.3810 (0.4588) time: 5.5778 data: 0.0001 max mem: 71357 -[20:19:06.010266] Epoch: [2] [4380/6500] lr: 0.000023 closs: 0.7763 (0.7524) grad_norm: 0.4355 (0.4589) time: 5.5759 data: 0.0001 max mem: 71357 -[20:20:01.808150] Epoch: [2] [4390/6500] lr: 0.000023 closs: 0.7244 (0.7523) grad_norm: 0.4355 (0.4589) time: 5.5766 data: 0.0001 max mem: 71357 -[20:20:57.625348] Epoch: [2] [4400/6500] lr: 0.000023 closs: 0.7288 (0.7523) grad_norm: 0.3992 (0.4587) time: 5.5807 data: 0.0001 max mem: 71357 -[20:21:53.324472] Epoch: [2] [4410/6500] lr: 0.000023 closs: 0.7448 (0.7522) grad_norm: 0.4006 (0.4586) time: 5.5757 data: 0.0001 max mem: 71357 -[20:22:49.049941] Epoch: [2] [4420/6500] lr: 0.000023 closs: 0.7451 (0.7522) grad_norm: 0.4197 (0.4587) time: 5.5712 data: 0.0001 max mem: 71357 -[20:23:44.748732] Epoch: [2] [4430/6500] lr: 0.000023 closs: 0.7624 (0.7522) grad_norm: 0.3477 (0.4584) time: 5.5711 data: 0.0001 max mem: 71357 -[20:24:40.497376] Epoch: [2] [4440/6500] lr: 0.000023 closs: 0.7418 (0.7522) grad_norm: 0.4209 (0.4585) time: 5.5723 data: 0.0001 max mem: 71357 -[20:25:36.293414] Epoch: [2] [4450/6500] lr: 0.000023 closs: 0.7588 (0.7522) grad_norm: 0.3757 (0.4587) time: 5.5772 data: 0.0001 max mem: 71357 -[20:26:31.999472] Epoch: [2] [4460/6500] lr: 0.000023 closs: 0.7787 (0.7523) grad_norm: 0.3697 (0.4586) time: 5.5750 data: 0.0001 max mem: 71357 -[20:27:27.726368] Epoch: [2] [4470/6500] lr: 0.000023 closs: 0.8265 (0.7524) grad_norm: 0.3947 (0.4585) time: 5.5715 data: 0.0001 max mem: 71357 -[20:28:23.380473] Epoch: [2] [4480/6500] lr: 0.000023 closs: 0.7835 (0.7524) grad_norm: 0.3890 (0.4584) time: 5.5689 data: 0.0001 max mem: 71357 -[20:29:19.182199] Epoch: [2] [4490/6500] lr: 0.000023 closs: 0.7434 (0.7525) grad_norm: 0.4197 (0.4584) time: 5.5727 data: 0.0001 max mem: 71357 -[20:30:14.955737] Epoch: [2] [4500/6500] lr: 0.000023 closs: 0.7503 (0.7527) grad_norm: 0.4197 (0.4583) time: 5.5787 data: 0.0001 max mem: 71357 -[20:31:10.618074] Epoch: [2] [4510/6500] lr: 0.000023 closs: 0.7323 (0.7525) grad_norm: 0.3890 (0.4581) time: 5.5717 data: 0.0001 max mem: 71357 -[20:32:06.333679] Epoch: [2] [4520/6500] lr: 0.000023 closs: 0.7582 (0.7526) grad_norm: 0.4145 (0.4580) time: 5.5688 data: 0.0001 max mem: 71357 -[20:33:02.124341] Epoch: [2] [4530/6500] lr: 0.000023 closs: 0.7793 (0.7527) grad_norm: 0.3605 (0.4579) time: 5.5752 data: 0.0001 max mem: 71357 -[20:33:57.818097] Epoch: [2] [4540/6500] lr: 0.000023 closs: 0.7636 (0.7527) grad_norm: 0.3769 (0.4582) time: 5.5741 data: 0.0001 max mem: 71357 -[20:34:53.515671] Epoch: [2] [4550/6500] lr: 0.000023 closs: 0.7089 (0.7527) grad_norm: 0.3769 (0.4582) time: 5.5695 data: 0.0001 max mem: 71357 -[20:35:49.117517] Epoch: [2] [4560/6500] lr: 0.000023 closs: 0.7042 (0.7527) grad_norm: 0.3902 (0.4581) time: 5.5649 data: 0.0001 max mem: 71357 -[20:36:44.782322] Epoch: [2] [4570/6500] lr: 0.000023 closs: 0.7926 (0.7528) grad_norm: 0.4067 (0.4581) time: 5.5632 data: 0.0001 max mem: 71357 -[20:37:40.621732] Epoch: [2] [4580/6500] lr: 0.000023 closs: 0.7278 (0.7527) grad_norm: 0.4201 (0.4580) time: 5.5751 data: 0.0001 max mem: 71357 -[20:38:36.392047] Epoch: [2] [4590/6500] lr: 0.000023 closs: 0.6795 (0.7525) grad_norm: 0.4201 (0.4580) time: 5.5804 data: 0.0001 max mem: 71357 -[20:39:32.120868] Epoch: [2] [4600/6500] lr: 0.000023 closs: 0.6879 (0.7525) grad_norm: 0.4157 (0.4579) time: 5.5749 data: 0.0001 max mem: 71357 -[20:40:27.907400] Epoch: [2] [4610/6500] lr: 0.000023 closs: 0.7533 (0.7526) grad_norm: 0.4157 (0.4580) time: 5.5757 data: 0.0001 max mem: 71357 -[20:41:23.723245] Epoch: [2] [4620/6500] lr: 0.000023 closs: 0.7976 (0.7527) grad_norm: 0.4157 (0.4581) time: 5.5800 data: 0.0001 max mem: 71357 -[20:42:19.443021] Epoch: [2] [4630/6500] lr: 0.000023 closs: 0.8094 (0.7528) grad_norm: 0.4242 (0.4581) time: 5.5767 data: 0.0001 max mem: 71357 -[20:43:15.254542] Epoch: [2] [4640/6500] lr: 0.000023 closs: 0.8314 (0.7530) grad_norm: 0.4242 (0.4580) time: 5.5764 data: 0.0001 max mem: 71357 -[20:44:11.113583] Epoch: [2] [4650/6500] lr: 0.000022 closs: 0.7892 (0.7530) grad_norm: 0.4242 (0.4579) time: 5.5834 data: 0.0001 max mem: 71357 -[20:45:06.929343] Epoch: [2] [4660/6500] lr: 0.000022 closs: 0.7705 (0.7531) grad_norm: 0.3678 (0.4577) time: 5.5836 data: 0.0001 max mem: 71357 -[20:46:02.726719] Epoch: [2] [4670/6500] lr: 0.000022 closs: 0.7750 (0.7532) grad_norm: 0.4022 (0.4577) time: 5.5806 data: 0.0001 max mem: 71357 -[20:46:58.391264] Epoch: [2] [4680/6500] lr: 0.000022 closs: 0.7492 (0.7534) grad_norm: 0.4199 (0.4578) time: 5.5730 data: 0.0001 max mem: 71357 -[20:47:54.137636] Epoch: [2] [4690/6500] lr: 0.000022 closs: 0.7607 (0.7534) grad_norm: 0.4330 (0.4579) time: 5.5705 data: 0.0001 max mem: 71357 -[20:48:49.866982] Epoch: [2] [4700/6500] lr: 0.000022 closs: 0.7612 (0.7535) grad_norm: 0.4420 (0.4581) time: 5.5737 data: 0.0001 max mem: 71357 -[20:49:45.726989] Epoch: [2] [4710/6500] lr: 0.000022 closs: 0.7462 (0.7535) grad_norm: 0.4460 (0.4581) time: 5.5794 data: 0.0002 max mem: 71357 -[20:50:41.469292] Epoch: [2] [4720/6500] lr: 0.000022 closs: 0.7418 (0.7533) grad_norm: 0.4460 (0.4580) time: 5.5800 data: 0.0002 max mem: 71357 -[20:51:37.246552] Epoch: [2] [4730/6500] lr: 0.000022 closs: 0.7053 (0.7532) grad_norm: 0.4460 (0.4580) time: 5.5759 data: 0.0001 max mem: 71357 -[20:52:33.006985] Epoch: [2] [4740/6500] lr: 0.000022 closs: 0.7434 (0.7532) grad_norm: 0.3723 (0.4579) time: 5.5768 data: 0.0001 max mem: 71357 -[20:53:28.828753] Epoch: [2] [4750/6500] lr: 0.000022 closs: 0.7571 (0.7532) grad_norm: 0.3702 (0.4580) time: 5.5790 data: 0.0001 max mem: 71357 -[20:54:24.592903] Epoch: [2] [4760/6500] lr: 0.000022 closs: 0.7973 (0.7534) grad_norm: 0.3789 (0.4579) time: 5.5792 data: 0.0001 max mem: 71357 -[20:55:20.352634] Epoch: [2] [4770/6500] lr: 0.000022 closs: 0.7573 (0.7534) grad_norm: 0.3854 (0.4581) time: 5.5761 data: 0.0001 max mem: 71357 -[20:56:16.111581] Epoch: [2] [4780/6500] lr: 0.000022 closs: 0.7458 (0.7534) grad_norm: 0.3854 (0.4584) time: 5.5758 data: 0.0001 max mem: 71357 -[20:57:11.915746] Epoch: [2] [4790/6500] lr: 0.000022 closs: 0.7730 (0.7534) grad_norm: 0.3789 (0.4581) time: 5.5780 data: 0.0001 max mem: 71357 -[20:58:07.707096] Epoch: [2] [4800/6500] lr: 0.000022 closs: 0.7921 (0.7534) grad_norm: 0.3786 (0.4579) time: 5.5797 data: 0.0001 max mem: 71357 -[20:59:03.442025] Epoch: [2] [4810/6500] lr: 0.000022 closs: 0.8092 (0.7535) grad_norm: 0.3616 (0.4578) time: 5.5762 data: 0.0001 max mem: 71357 -[20:59:59.160618] Epoch: [2] [4820/6500] lr: 0.000022 closs: 0.8096 (0.7535) grad_norm: 0.3616 (0.4579) time: 5.5726 data: 0.0001 max mem: 71357 -[21:00:55.024537] Epoch: [2] [4830/6500] lr: 0.000022 closs: 0.7769 (0.7535) grad_norm: 0.3820 (0.4578) time: 5.5791 data: 0.0001 max mem: 71357 -[21:01:50.864427] Epoch: [2] [4840/6500] lr: 0.000022 closs: 0.8090 (0.7537) grad_norm: 0.3829 (0.4577) time: 5.5851 data: 0.0001 max mem: 71357 -[21:02:46.526123] Epoch: [2] [4850/6500] lr: 0.000022 closs: 0.7622 (0.7536) grad_norm: 0.3724 (0.4575) time: 5.5750 data: 0.0001 max mem: 71357 -[21:03:42.259788] Epoch: [2] [4860/6500] lr: 0.000022 closs: 0.6715 (0.7535) grad_norm: 0.3909 (0.4574) time: 5.5696 data: 0.0002 max mem: 71357 -[21:04:37.979185] Epoch: [2] [4870/6500] lr: 0.000022 closs: 0.7794 (0.7535) grad_norm: 0.3939 (0.4574) time: 5.5725 data: 0.0002 max mem: 71357 -[21:05:33.778939] Epoch: [2] [4880/6500] lr: 0.000022 closs: 0.7504 (0.7535) grad_norm: 0.3825 (0.4576) time: 5.5758 data: 0.0002 max mem: 71357 -[21:06:29.677234] Epoch: [2] [4890/6500] lr: 0.000022 closs: 0.7927 (0.7536) grad_norm: 0.3982 (0.4581) time: 5.5848 data: 0.0002 max mem: 71357 -[21:07:25.514489] Epoch: [2] [4900/6500] lr: 0.000022 closs: 0.7927 (0.7536) grad_norm: 0.3893 (0.4580) time: 5.5867 data: 0.0002 max mem: 71357 -[21:08:21.434487] Epoch: [2] [4910/6500] lr: 0.000022 closs: 0.6924 (0.7535) grad_norm: 0.3806 (0.4577) time: 5.5877 data: 0.0002 max mem: 71357 -[21:09:17.248775] Epoch: [2] [4920/6500] lr: 0.000022 closs: 0.6780 (0.7535) grad_norm: 0.3656 (0.4577) time: 5.5866 data: 0.0002 max mem: 71357 -[21:10:13.285780] Epoch: [2] [4930/6500] lr: 0.000021 closs: 0.7640 (0.7536) grad_norm: 0.3893 (0.4577) time: 5.5925 data: 0.0003 max mem: 71357 -[21:11:09.052304] Epoch: [2] [4940/6500] lr: 0.000021 closs: 0.7222 (0.7535) grad_norm: 0.3819 (0.4575) time: 5.5901 data: 0.0002 max mem: 71357 -[21:12:04.750288] Epoch: [2] [4950/6500] lr: 0.000021 closs: 0.7536 (0.7536) grad_norm: 0.3978 (0.4575) time: 5.5731 data: 0.0001 max mem: 71357 -[21:13:00.512784] Epoch: [2] [4960/6500] lr: 0.000021 closs: 0.7589 (0.7535) grad_norm: 0.3978 (0.4574) time: 5.5729 data: 0.0001 max mem: 71357 -[21:13:56.368221] Epoch: [2] [4970/6500] lr: 0.000021 closs: 0.6880 (0.7534) grad_norm: 0.4164 (0.4574) time: 5.5808 data: 0.0001 max mem: 71357 -[21:14:52.098157] Epoch: [2] [4980/6500] lr: 0.000021 closs: 0.6675 (0.7533) grad_norm: 0.4466 (0.4576) time: 5.5792 data: 0.0002 max mem: 71357 -[21:15:47.834844] Epoch: [2] [4990/6500] lr: 0.000021 closs: 0.7294 (0.7534) grad_norm: 0.4218 (0.4576) time: 5.5732 data: 0.0002 max mem: 71357 -[21:16:43.616335] Epoch: [2] [5000/6500] lr: 0.000021 closs: 0.7294 (0.7533) grad_norm: 0.4392 (0.4575) time: 5.5758 data: 0.0001 max mem: 71357 -[21:17:39.418621] Epoch: [2] [5010/6500] lr: 0.000021 closs: 0.6945 (0.7532) grad_norm: 0.3921 (0.4575) time: 5.5791 data: 0.0001 max mem: 71357 -[21:18:35.268740] Epoch: [2] [5020/6500] lr: 0.000021 closs: 0.7293 (0.7533) grad_norm: 0.3777 (0.4574) time: 5.5825 data: 0.0001 max mem: 71357 -[21:19:31.074686] Epoch: [2] [5030/6500] lr: 0.000021 closs: 0.7293 (0.7532) grad_norm: 0.3928 (0.4573) time: 5.5827 data: 0.0002 max mem: 71357 -[21:20:26.820398] Epoch: [2] [5040/6500] lr: 0.000021 closs: 0.7199 (0.7531) grad_norm: 0.4195 (0.4573) time: 5.5774 data: 0.0002 max mem: 71357 -[21:21:22.626457] Epoch: [2] [5050/6500] lr: 0.000021 closs: 0.7394 (0.7530) grad_norm: 0.3752 (0.4572) time: 5.5775 data: 0.0001 max mem: 71357 -[21:22:18.529363] Epoch: [2] [5060/6500] lr: 0.000021 closs: 0.6648 (0.7528) grad_norm: 0.3752 (0.4571) time: 5.5854 data: 0.0001 max mem: 71357 -[21:23:14.371468] Epoch: [2] [5070/6500] lr: 0.000021 closs: 0.6893 (0.7528) grad_norm: 0.3752 (0.4574) time: 5.5872 data: 0.0002 max mem: 71357 -[21:24:10.166597] Epoch: [2] [5080/6500] lr: 0.000021 closs: 0.8464 (0.7531) grad_norm: 0.3935 (0.4573) time: 5.5818 data: 0.0002 max mem: 71357 -[21:25:06.007932] Epoch: [2] [5090/6500] lr: 0.000021 closs: 0.7938 (0.7531) grad_norm: 0.4019 (0.4572) time: 5.5817 data: 0.0002 max mem: 71357 -[21:26:01.836094] Epoch: [2] [5100/6500] lr: 0.000021 closs: 0.7887 (0.7532) grad_norm: 0.4274 (0.4577) time: 5.5833 data: 0.0002 max mem: 71357 -[21:26:57.816257] Epoch: [2] [5110/6500] lr: 0.000021 closs: 0.7349 (0.7531) grad_norm: 0.4136 (0.4577) time: 5.5903 data: 0.0002 max mem: 71357 -[21:27:53.625174] Epoch: [2] [5120/6500] lr: 0.000021 closs: 0.7168 (0.7532) grad_norm: 0.4136 (0.4575) time: 5.5893 data: 0.0003 max mem: 71357 -[21:28:49.364341] Epoch: [2] [5130/6500] lr: 0.000021 closs: 0.7883 (0.7531) grad_norm: 0.4200 (0.4574) time: 5.5773 data: 0.0002 max mem: 71357 -[21:29:45.116681] Epoch: [2] [5140/6500] lr: 0.000021 closs: 0.8003 (0.7533) grad_norm: 0.3964 (0.4575) time: 5.5744 data: 0.0002 max mem: 71357 -[21:30:41.097664] Epoch: [2] [5150/6500] lr: 0.000021 closs: 0.7947 (0.7533) grad_norm: 0.4253 (0.4577) time: 5.5865 data: 0.0002 max mem: 71357 -[21:31:36.852342] Epoch: [2] [5160/6500] lr: 0.000021 closs: 0.6738 (0.7530) grad_norm: 0.4253 (0.4575) time: 5.5866 data: 0.0002 max mem: 71357 -[21:32:32.634601] Epoch: [2] [5170/6500] lr: 0.000021 closs: 0.6738 (0.7530) grad_norm: 0.4441 (0.4575) time: 5.5767 data: 0.0002 max mem: 71357 -[21:33:28.375412] Epoch: [2] [5180/6500] lr: 0.000021 closs: 0.7572 (0.7529) grad_norm: 0.4158 (0.4575) time: 5.5760 data: 0.0001 max mem: 71357 -[21:34:24.373049] Epoch: [2] [5190/6500] lr: 0.000021 closs: 0.7376 (0.7530) grad_norm: 0.3803 (0.4574) time: 5.5868 data: 0.0001 max mem: 71357 -[21:35:20.090392] Epoch: [2] [5200/6500] lr: 0.000021 closs: 0.7732 (0.7529) grad_norm: 0.3696 (0.4575) time: 5.5856 data: 0.0001 max mem: 71357 -[21:36:15.790968] Epoch: [2] [5210/6500] lr: 0.000021 closs: 0.7453 (0.7529) grad_norm: 0.3696 (0.4573) time: 5.5708 data: 0.0001 max mem: 71357 -[21:37:11.636798] Epoch: [2] [5220/6500] lr: 0.000020 closs: 0.7165 (0.7528) grad_norm: 0.3760 (0.4576) time: 5.5772 data: 0.0001 max mem: 71357 -[21:38:07.477832] Epoch: [2] [5230/6500] lr: 0.000020 closs: 0.7192 (0.7528) grad_norm: 0.3629 (0.4574) time: 5.5842 data: 0.0001 max mem: 71357 -[21:39:03.327791] Epoch: [2] [5240/6500] lr: 0.000020 closs: 0.7629 (0.7528) grad_norm: 0.4283 (0.4577) time: 5.5844 data: 0.0001 max mem: 71357 -[21:39:59.108460] Epoch: [2] [5250/6500] lr: 0.000020 closs: 0.7569 (0.7529) grad_norm: 0.4310 (0.4577) time: 5.5814 data: 0.0002 max mem: 71357 -[21:40:54.871076] Epoch: [2] [5260/6500] lr: 0.000020 closs: 0.7255 (0.7528) grad_norm: 0.4283 (0.4581) time: 5.5771 data: 0.0002 max mem: 71357 -[21:41:50.589820] Epoch: [2] [5270/6500] lr: 0.000020 closs: 0.7657 (0.7530) grad_norm: 0.4283 (0.4580) time: 5.5740 data: 0.0001 max mem: 71357 -[21:42:46.474046] Epoch: [2] [5280/6500] lr: 0.000020 closs: 0.7840 (0.7530) grad_norm: 0.3961 (0.4581) time: 5.5801 data: 0.0001 max mem: 71357 -[21:43:42.200918] Epoch: [2] [5290/6500] lr: 0.000020 closs: 0.7246 (0.7530) grad_norm: 0.4003 (0.4582) time: 5.5805 data: 0.0001 max mem: 71357 -[21:44:37.997141] Epoch: [2] [5300/6500] lr: 0.000020 closs: 0.7503 (0.7530) grad_norm: 0.3962 (0.4580) time: 5.5760 data: 0.0001 max mem: 71357 -[21:45:33.699151] Epoch: [2] [5310/6500] lr: 0.000020 closs: 0.7254 (0.7531) grad_norm: 0.4213 (0.4580) time: 5.5748 data: 0.0001 max mem: 71357 -[21:46:29.584831] Epoch: [2] [5320/6500] lr: 0.000020 closs: 0.8009 (0.7532) grad_norm: 0.4213 (0.4581) time: 5.5793 data: 0.0001 max mem: 71357 -[21:47:25.383542] Epoch: [2] [5330/6500] lr: 0.000020 closs: 0.8009 (0.7533) grad_norm: 0.4237 (0.4581) time: 5.5841 data: 0.0001 max mem: 71357 -[21:48:21.194631] Epoch: [2] [5340/6500] lr: 0.000020 closs: 0.8154 (0.7535) grad_norm: 0.4372 (0.4580) time: 5.5804 data: 0.0001 max mem: 71357 -[21:49:16.955311] Epoch: [2] [5350/6500] lr: 0.000020 closs: 0.7530 (0.7534) grad_norm: 0.4259 (0.4580) time: 5.5785 data: 0.0001 max mem: 71357 -[21:50:12.677438] Epoch: [2] [5360/6500] lr: 0.000020 closs: 0.7140 (0.7534) grad_norm: 0.4259 (0.4581) time: 5.5741 data: 0.0001 max mem: 71357 -[21:51:08.506115] Epoch: [2] [5370/6500] lr: 0.000020 closs: 0.7140 (0.7534) grad_norm: 0.3761 (0.4579) time: 5.5774 data: 0.0001 max mem: 71357 -[21:52:04.259279] Epoch: [2] [5380/6500] lr: 0.000020 closs: 0.6909 (0.7533) grad_norm: 0.3761 (0.4579) time: 5.5790 data: 0.0001 max mem: 71357 -[21:53:00.007049] Epoch: [2] [5390/6500] lr: 0.000020 closs: 0.7034 (0.7533) grad_norm: 0.3758 (0.4578) time: 5.5750 data: 0.0001 max mem: 71357 -[21:53:55.686501] Epoch: [2] [5400/6500] lr: 0.000020 closs: 0.7651 (0.7533) grad_norm: 0.3841 (0.4578) time: 5.5713 data: 0.0001 max mem: 71357 -[21:54:51.514514] Epoch: [2] [5410/6500] lr: 0.000020 closs: 0.7785 (0.7534) grad_norm: 0.4139 (0.4580) time: 5.5753 data: 0.0001 max mem: 71357 -[21:55:47.154445] Epoch: [2] [5420/6500] lr: 0.000020 closs: 0.7544 (0.7533) grad_norm: 0.4081 (0.4579) time: 5.5733 data: 0.0001 max mem: 71357 -[21:56:42.951537] Epoch: [2] [5430/6500] lr: 0.000020 closs: 0.6845 (0.7532) grad_norm: 0.4150 (0.4579) time: 5.5717 data: 0.0001 max mem: 71357 -[21:57:38.676644] Epoch: [2] [5440/6500] lr: 0.000020 closs: 0.8226 (0.7534) grad_norm: 0.4081 (0.4578) time: 5.5760 data: 0.0001 max mem: 71357 -[21:58:34.461701] Epoch: [2] [5450/6500] lr: 0.000020 closs: 0.7645 (0.7533) grad_norm: 0.4039 (0.4578) time: 5.5754 data: 0.0001 max mem: 71357 -[21:59:30.295020] Epoch: [2] [5460/6500] lr: 0.000020 closs: 0.7645 (0.7535) grad_norm: 0.4783 (0.4581) time: 5.5808 data: 0.0001 max mem: 71357 -[22:00:25.992583] Epoch: [2] [5470/6500] lr: 0.000020 closs: 0.8520 (0.7536) grad_norm: 0.4082 (0.4580) time: 5.5765 data: 0.0001 max mem: 71357 -[22:01:21.691970] Epoch: [2] [5480/6500] lr: 0.000020 closs: 0.7229 (0.7536) grad_norm: 0.4799 (0.4582) time: 5.5697 data: 0.0001 max mem: 71357 -[22:02:17.516596] Epoch: [2] [5490/6500] lr: 0.000020 closs: 0.7057 (0.7536) grad_norm: 0.4082 (0.4581) time: 5.5761 data: 0.0001 max mem: 71357 -[22:03:13.445909] Epoch: [2] [5500/6500] lr: 0.000020 closs: 0.6673 (0.7533) grad_norm: 0.3902 (0.4579) time: 5.5876 data: 0.0001 max mem: 71357 -[22:04:09.147708] Epoch: [2] [5510/6500] lr: 0.000019 closs: 0.6875 (0.7535) grad_norm: 0.3914 (0.4580) time: 5.5814 data: 0.0001 max mem: 71357 -[22:05:04.874061] Epoch: [2] [5520/6500] lr: 0.000019 closs: 0.7620 (0.7535) grad_norm: 0.3816 (0.4579) time: 5.5713 data: 0.0002 max mem: 71357 -[22:06:00.681888] Epoch: [2] [5530/6500] lr: 0.000019 closs: 0.7095 (0.7535) grad_norm: 0.3783 (0.4601) time: 5.5766 data: 0.0002 max mem: 71357 -[22:06:56.443705] Epoch: [2] [5540/6500] lr: 0.000019 closs: 0.7318 (0.7536) grad_norm: 0.4034 (0.4601) time: 5.5784 data: 0.0001 max mem: 71357 -[22:07:52.418745] Epoch: [2] [5550/6500] lr: 0.000019 closs: 0.7458 (0.7538) grad_norm: 0.3594 (0.4599) time: 5.5867 data: 0.0001 max mem: 71357 -[22:08:48.144479] Epoch: [2] [5560/6500] lr: 0.000019 closs: 0.7224 (0.7537) grad_norm: 0.4024 (0.4599) time: 5.5849 data: 0.0001 max mem: 71357 -[22:09:43.897713] Epoch: [2] [5570/6500] lr: 0.000019 closs: 0.7157 (0.7537) grad_norm: 0.3973 (0.4597) time: 5.5739 data: 0.0001 max mem: 71357 -[22:10:39.690994] Epoch: [2] [5580/6500] lr: 0.000019 closs: 0.7680 (0.7539) grad_norm: 0.3898 (0.4596) time: 5.5773 data: 0.0001 max mem: 71357 -[22:11:35.469045] Epoch: [2] [5590/6500] lr: 0.000019 closs: 0.7702 (0.7539) grad_norm: 0.3973 (0.4594) time: 5.5785 data: 0.0001 max mem: 71357 -[22:12:31.188413] Epoch: [2] [5600/6500] lr: 0.000019 closs: 0.7568 (0.7539) grad_norm: 0.3946 (0.4593) time: 5.5747 data: 0.0001 max mem: 71357 -[22:13:26.862699] Epoch: [2] [5610/6500] lr: 0.000019 closs: 0.7371 (0.7539) grad_norm: 0.3946 (0.4594) time: 5.5696 data: 0.0001 max mem: 71357 -[22:14:22.622269] Epoch: [2] [5620/6500] lr: 0.000019 closs: 0.7695 (0.7540) grad_norm: 0.4004 (0.4594) time: 5.5716 data: 0.0001 max mem: 71357 -[22:15:18.453726] Epoch: [2] [5630/6500] lr: 0.000019 closs: 0.7800 (0.7540) grad_norm: 0.3960 (0.4595) time: 5.5795 data: 0.0001 max mem: 71357 -[22:16:14.272135] Epoch: [2] [5640/6500] lr: 0.000019 closs: 0.7343 (0.7540) grad_norm: 0.3889 (0.4594) time: 5.5824 data: 0.0001 max mem: 71357 -[22:17:10.010456] Epoch: [2] [5650/6500] lr: 0.000019 closs: 0.7035 (0.7539) grad_norm: 0.3810 (0.4594) time: 5.5777 data: 0.0001 max mem: 71357 -[22:18:05.780687] Epoch: [2] [5660/6500] lr: 0.000019 closs: 0.7525 (0.7540) grad_norm: 0.3810 (0.4593) time: 5.5753 data: 0.0001 max mem: 71357 -[22:19:01.603251] Epoch: [2] [5670/6500] lr: 0.000019 closs: 0.7833 (0.7539) grad_norm: 0.3969 (0.4594) time: 5.5795 data: 0.0001 max mem: 71357 -[22:19:57.464364] Epoch: [2] [5680/6500] lr: 0.000019 closs: 0.6852 (0.7538) grad_norm: 0.4027 (0.4592) time: 5.5841 data: 0.0002 max mem: 71357 -[22:20:53.232893] Epoch: [2] [5690/6500] lr: 0.000019 closs: 0.7009 (0.7537) grad_norm: 0.3941 (0.4593) time: 5.5814 data: 0.0002 max mem: 71357 -[22:21:49.061197] Epoch: [2] [5700/6500] lr: 0.000019 closs: 0.7428 (0.7538) grad_norm: 0.3807 (0.4591) time: 5.5798 data: 0.0001 max mem: 71357 -[22:22:44.755870] Epoch: [2] [5710/6500] lr: 0.000019 closs: 0.7428 (0.7538) grad_norm: 0.3692 (0.4591) time: 5.5761 data: 0.0001 max mem: 71357 -[22:23:40.624436] Epoch: [2] [5720/6500] lr: 0.000019 closs: 0.6894 (0.7538) grad_norm: 0.3692 (0.4590) time: 5.5780 data: 0.0001 max mem: 71357 -[22:24:36.360822] Epoch: [2] [5730/6500] lr: 0.000019 closs: 0.7312 (0.7538) grad_norm: 0.3719 (0.4599) time: 5.5801 data: 0.0001 max mem: 71357 -[22:25:32.063175] Epoch: [2] [5740/6500] lr: 0.000019 closs: 0.7312 (0.7537) grad_norm: 0.4130 (0.4600) time: 5.5719 data: 0.0001 max mem: 71357 -[22:26:27.774640] Epoch: [2] [5750/6500] lr: 0.000019 closs: 0.6550 (0.7536) grad_norm: 0.4130 (0.4601) time: 5.5706 data: 0.0001 max mem: 71357 -[22:27:23.569992] Epoch: [2] [5760/6500] lr: 0.000019 closs: 0.6720 (0.7535) grad_norm: 0.4130 (0.4600) time: 5.5753 data: 0.0001 max mem: 71357 -[22:28:19.393765] Epoch: [2] [5770/6500] lr: 0.000019 closs: 0.6776 (0.7535) grad_norm: 0.4095 (0.4599) time: 5.5809 data: 0.0001 max mem: 71357 -[22:29:15.058736] Epoch: [2] [5780/6500] lr: 0.000019 closs: 0.7125 (0.7535) grad_norm: 0.3872 (0.4598) time: 5.5743 data: 0.0001 max mem: 71357 -[22:30:10.761069] Epoch: [2] [5790/6500] lr: 0.000019 closs: 0.7571 (0.7536) grad_norm: 0.3880 (0.4604) time: 5.5683 data: 0.0001 max mem: 71357 -[22:31:06.473078] Epoch: [2] [5800/6500] lr: 0.000019 closs: 0.7571 (0.7535) grad_norm: 0.4095 (0.4604) time: 5.5706 data: 0.0001 max mem: 71357 -[22:32:02.241747] Epoch: [2] [5810/6500] lr: 0.000018 closs: 0.7479 (0.7536) grad_norm: 0.4455 (0.4605) time: 5.5740 data: 0.0001 max mem: 71357 -[22:32:57.961342] Epoch: [2] [5820/6500] lr: 0.000018 closs: 0.7479 (0.7535) grad_norm: 0.4557 (0.4606) time: 5.5743 data: 0.0001 max mem: 71357 -[22:33:53.730026] Epoch: [2] [5830/6500] lr: 0.000018 closs: 0.7750 (0.7537) grad_norm: 0.4460 (0.4607) time: 5.5743 data: 0.0001 max mem: 71357 -[22:34:49.531097] Epoch: [2] [5840/6500] lr: 0.000018 closs: 0.7832 (0.7537) grad_norm: 0.4754 (0.4607) time: 5.5784 data: 0.0002 max mem: 71357 -[22:35:45.415785] Epoch: [2] [5850/6500] lr: 0.000018 closs: 0.7469 (0.7538) grad_norm: 0.4186 (0.4606) time: 5.5842 data: 0.0002 max mem: 71357 -[22:36:41.194513] Epoch: [2] [5860/6500] lr: 0.000018 closs: 0.7760 (0.7538) grad_norm: 0.3871 (0.4605) time: 5.5831 data: 0.0001 max mem: 71357 -[22:37:36.993239] Epoch: [2] [5870/6500] lr: 0.000018 closs: 0.7566 (0.7538) grad_norm: 0.3684 (0.4603) time: 5.5788 data: 0.0001 max mem: 71357 -[22:38:32.745756] Epoch: [2] [5880/6500] lr: 0.000018 closs: 0.7442 (0.7537) grad_norm: 0.3798 (0.4603) time: 5.5774 data: 0.0001 max mem: 71357 -[22:39:28.509506] Epoch: [2] [5890/6500] lr: 0.000018 closs: 0.7118 (0.7536) grad_norm: 0.3814 (0.4602) time: 5.5757 data: 0.0001 max mem: 71357 -[22:40:24.433869] Epoch: [2] [5900/6500] lr: 0.000018 closs: 0.6788 (0.7535) grad_norm: 0.3861 (0.4602) time: 5.5843 data: 0.0001 max mem: 71357 -[22:41:20.129147] Epoch: [2] [5910/6500] lr: 0.000018 closs: 0.7175 (0.7536) grad_norm: 0.4095 (0.4603) time: 5.5809 data: 0.0001 max mem: 71357 -[22:42:15.865346] Epoch: [2] [5920/6500] lr: 0.000018 closs: 0.7207 (0.7535) grad_norm: 0.4065 (0.4602) time: 5.5715 data: 0.0001 max mem: 71357 -[22:43:11.634563] Epoch: [2] [5930/6500] lr: 0.000018 closs: 0.7174 (0.7535) grad_norm: 0.4065 (0.4613) time: 5.5752 data: 0.0001 max mem: 71357 -[22:44:07.473739] Epoch: [2] [5940/6500] lr: 0.000018 closs: 0.7479 (0.7535) grad_norm: 0.3961 (0.4614) time: 5.5803 data: 0.0001 max mem: 71357 -[22:45:03.266670] Epoch: [2] [5950/6500] lr: 0.000018 closs: 0.7342 (0.7535) grad_norm: 0.3684 (0.4612) time: 5.5815 data: 0.0002 max mem: 71357 -[22:45:59.020127] Epoch: [2] [5960/6500] lr: 0.000018 closs: 0.6752 (0.7534) grad_norm: 0.3959 (0.4612) time: 5.5772 data: 0.0002 max mem: 71357 -[22:46:54.732305] Epoch: [2] [5970/6500] lr: 0.000018 closs: 0.7144 (0.7534) grad_norm: 0.4040 (0.4613) time: 5.5732 data: 0.0001 max mem: 71357 -[22:47:50.554503] Epoch: [2] [5980/6500] lr: 0.000018 closs: 0.7962 (0.7534) grad_norm: 0.4040 (0.4624) time: 5.5766 data: 0.0001 max mem: 71357 -[22:48:46.306751] Epoch: [2] [5990/6500] lr: 0.000018 closs: 0.7715 (0.7535) grad_norm: 0.3959 (0.4623) time: 5.5786 data: 0.0001 max mem: 71357 -[22:49:41.974220] Epoch: [2] [6000/6500] lr: 0.000018 closs: 0.7402 (0.7536) grad_norm: 0.3846 (0.4624) time: 5.5709 data: 0.0001 max mem: 71357 -[22:50:37.783659] Epoch: [2] [6010/6500] lr: 0.000018 closs: 0.7132 (0.7535) grad_norm: 0.3823 (0.4623) time: 5.5738 data: 0.0001 max mem: 71357 -[22:51:33.639991] Epoch: [2] [6020/6500] lr: 0.000018 closs: 0.7106 (0.7535) grad_norm: 0.3712 (0.4622) time: 5.5832 data: 0.0001 max mem: 71357 -[22:52:29.438869] Epoch: [2] [6030/6500] lr: 0.000018 closs: 0.7124 (0.7535) grad_norm: 0.3909 (0.4623) time: 5.5827 data: 0.0001 max mem: 71357 -[22:53:25.189254] Epoch: [2] [6040/6500] lr: 0.000018 closs: 0.7124 (0.7534) grad_norm: 0.3894 (0.4622) time: 5.5774 data: 0.0001 max mem: 71357 -[22:54:20.915316] Epoch: [2] [6050/6500] lr: 0.000018 closs: 0.7851 (0.7536) grad_norm: 0.3782 (0.4621) time: 5.5737 data: 0.0001 max mem: 71357 -[22:55:16.695819] Epoch: [2] [6060/6500] lr: 0.000018 closs: 0.7867 (0.7536) grad_norm: 0.3782 (0.4621) time: 5.5752 data: 0.0002 max mem: 71357 -[22:56:12.546878] Epoch: [2] [6070/6500] lr: 0.000018 closs: 0.7574 (0.7536) grad_norm: 0.3723 (0.4619) time: 5.5815 data: 0.0002 max mem: 71357 -[22:57:08.327277] Epoch: [2] [6080/6500] lr: 0.000018 closs: 0.6751 (0.7537) grad_norm: 0.3723 (0.4619) time: 5.5815 data: 0.0001 max mem: 71357 -[22:58:04.087779] Epoch: [2] [6090/6500] lr: 0.000018 closs: 0.7851 (0.7536) grad_norm: 0.3784 (0.4620) time: 5.5769 data: 0.0001 max mem: 71357 -[22:58:59.916758] Epoch: [2] [6100/6500] lr: 0.000018 closs: 0.7344 (0.7536) grad_norm: 0.4472 (0.4621) time: 5.5794 data: 0.0001 max mem: 71357 -[22:59:55.759053] Epoch: [2] [6110/6500] lr: 0.000017 closs: 0.7704 (0.7537) grad_norm: 0.4621 (0.4621) time: 5.5834 data: 0.0002 max mem: 71357 -[23:00:51.574760] Epoch: [2] [6120/6500] lr: 0.000017 closs: 0.8100 (0.7537) grad_norm: 0.4472 (0.4620) time: 5.5828 data: 0.0002 max mem: 71357 -[23:01:47.278852] Epoch: [2] [6130/6500] lr: 0.000017 closs: 0.7380 (0.7537) grad_norm: 0.4204 (0.4620) time: 5.5759 data: 0.0001 max mem: 71357 -[23:02:43.031933] Epoch: [2] [6140/6500] lr: 0.000017 closs: 0.7306 (0.7537) grad_norm: 0.4444 (0.4623) time: 5.5728 data: 0.0001 max mem: 71357 -[23:03:38.769960] Epoch: [2] [6150/6500] lr: 0.000017 closs: 0.7840 (0.7539) grad_norm: 0.4829 (0.4623) time: 5.5745 data: 0.0001 max mem: 71357 -[23:04:34.600484] Epoch: [2] [6160/6500] lr: 0.000017 closs: 0.7840 (0.7539) grad_norm: 0.4829 (0.4622) time: 5.5783 data: 0.0001 max mem: 71357 -[23:05:30.286996] Epoch: [2] [6170/6500] lr: 0.000017 closs: 0.7545 (0.7540) grad_norm: 0.4356 (0.4622) time: 5.5758 data: 0.0001 max mem: 71357 -[23:06:25.993506] Epoch: [2] [6180/6500] lr: 0.000017 closs: 0.7504 (0.7539) grad_norm: 0.4211 (0.4621) time: 5.5696 data: 0.0001 max mem: 71357 -[23:07:21.647010] Epoch: [2] [6190/6500] lr: 0.000017 closs: 0.7039 (0.7539) grad_norm: 0.4211 (0.4628) time: 5.5679 data: 0.0001 max mem: 71357 -[23:08:17.362028] Epoch: [2] [6200/6500] lr: 0.000017 closs: 0.6685 (0.7538) grad_norm: 0.4351 (0.4630) time: 5.5684 data: 0.0001 max mem: 71357 -[23:09:13.186215] Epoch: [2] [6210/6500] lr: 0.000017 closs: 0.7096 (0.7538) grad_norm: 0.4211 (0.4629) time: 5.5769 data: 0.0001 max mem: 71357 -[23:10:08.928622] Epoch: [2] [6220/6500] lr: 0.000017 closs: 0.7746 (0.7537) grad_norm: 0.4352 (0.4630) time: 5.5782 data: 0.0001 max mem: 71357 -[23:11:04.652755] Epoch: [2] [6230/6500] lr: 0.000017 closs: 0.7821 (0.7538) grad_norm: 0.4103 (0.4630) time: 5.5732 data: 0.0001 max mem: 71357 -[23:12:00.419833] Epoch: [2] [6240/6500] lr: 0.000017 closs: 0.7860 (0.7538) grad_norm: 0.4103 (0.4630) time: 5.5745 data: 0.0001 max mem: 71357 -[23:12:56.282491] Epoch: [2] [6250/6500] lr: 0.000017 closs: 0.7497 (0.7538) grad_norm: 0.4111 (0.4630) time: 5.5814 data: 0.0001 max mem: 71357 -[23:13:52.002571] Epoch: [2] [6260/6500] lr: 0.000017 closs: 0.6927 (0.7537) grad_norm: 0.4111 (0.4630) time: 5.5791 data: 0.0001 max mem: 71357 -[23:14:47.790512] Epoch: [2] [6270/6500] lr: 0.000017 closs: 0.6601 (0.7535) grad_norm: 0.4325 (0.4631) time: 5.5753 data: 0.0001 max mem: 71357 -[23:15:43.475077] Epoch: [2] [6280/6500] lr: 0.000017 closs: 0.7613 (0.7537) grad_norm: 0.4325 (0.4631) time: 5.5735 data: 0.0001 max mem: 71357 -[23:16:39.282617] Epoch: [2] [6290/6500] lr: 0.000017 closs: 0.7696 (0.7537) grad_norm: 0.4325 (0.4631) time: 5.5745 data: 0.0001 max mem: 71357 -[23:17:34.943611] Epoch: [2] [6300/6500] lr: 0.000017 closs: 0.6945 (0.7536) grad_norm: 0.4325 (0.4631) time: 5.5733 data: 0.0001 max mem: 71357 -[23:18:30.788632] Epoch: [2] [6310/6500] lr: 0.000017 closs: 0.6751 (0.7535) grad_norm: 0.3664 (0.4629) time: 5.5752 data: 0.0001 max mem: 71357 -[23:19:26.585278] Epoch: [2] [6320/6500] lr: 0.000017 closs: 0.7299 (0.7535) grad_norm: 0.3616 (0.4629) time: 5.5820 data: 0.0001 max mem: 71357 -[23:20:22.397220] Epoch: [2] [6330/6500] lr: 0.000017 closs: 0.7299 (0.7535) grad_norm: 0.3927 (0.4630) time: 5.5803 data: 0.0002 max mem: 71357 -[23:21:18.332983] Epoch: [2] [6340/6500] lr: 0.000017 closs: 0.7740 (0.7536) grad_norm: 0.3840 (0.4629) time: 5.5873 data: 0.0002 max mem: 71357 -[23:22:14.017952] Epoch: [2] [6350/6500] lr: 0.000017 closs: 0.7530 (0.7536) grad_norm: 0.4238 (0.4629) time: 5.5809 data: 0.0002 max mem: 71357 -[23:23:09.970496] Epoch: [2] [6360/6500] lr: 0.000017 closs: 0.6566 (0.7535) grad_norm: 0.3840 (0.4627) time: 5.5818 data: 0.0002 max mem: 71357 -[23:24:05.875075] Epoch: [2] [6370/6500] lr: 0.000017 closs: 0.6932 (0.7535) grad_norm: 0.3840 (0.4627) time: 5.5928 data: 0.0002 max mem: 71357 -[23:25:01.751484] Epoch: [2] [6380/6500] lr: 0.000017 closs: 0.6841 (0.7535) grad_norm: 0.4111 (0.4626) time: 5.5890 data: 0.0002 max mem: 71357 -[23:25:57.533484] Epoch: [2] [6390/6500] lr: 0.000017 closs: 0.7375 (0.7535) grad_norm: 0.3716 (0.4625) time: 5.5828 data: 0.0002 max mem: 71357 -[23:26:53.655347] Epoch: [2] [6400/6500] lr: 0.000017 closs: 0.7540 (0.7535) grad_norm: 0.4111 (0.4624) time: 5.5946 data: 0.0002 max mem: 71357 -[23:27:49.479003] Epoch: [2] [6410/6500] lr: 0.000017 closs: 0.6857 (0.7535) grad_norm: 0.3663 (0.4622) time: 5.5967 data: 0.0002 max mem: 71357 -[23:28:45.243961] Epoch: [2] [6420/6500] lr: 0.000017 closs: 0.7293 (0.7535) grad_norm: 0.3680 (0.4622) time: 5.5794 data: 0.0002 max mem: 71357 -[23:29:41.165348] Epoch: [2] [6430/6500] lr: 0.000016 closs: 0.7293 (0.7536) grad_norm: 0.3680 (0.4621) time: 5.5842 data: 0.0001 max mem: 71357 -[23:30:37.000964] Epoch: [2] [6440/6500] lr: 0.000016 closs: 0.6909 (0.7535) grad_norm: 0.3927 (0.4620) time: 5.5878 data: 0.0001 max mem: 71357 -[23:31:32.684603] Epoch: [2] [6450/6500] lr: 0.000016 closs: 0.7128 (0.7534) grad_norm: 0.3942 (0.4621) time: 5.5759 data: 0.0001 max mem: 71357 -[23:32:28.371373] Epoch: [2] [6460/6500] lr: 0.000016 closs: 0.7128 (0.7534) grad_norm: 0.3942 (0.4621) time: 5.5685 data: 0.0001 max mem: 71357 -[23:33:24.189082] Epoch: [2] [6470/6500] lr: 0.000016 closs: 0.6997 (0.7533) grad_norm: 0.3942 (0.4619) time: 5.5752 data: 0.0001 max mem: 71357 -[23:34:19.829473] Epoch: [2] [6480/6500] lr: 0.000016 closs: 0.6889 (0.7533) grad_norm: 0.3942 (0.4619) time: 5.5728 data: 0.0001 max mem: 71357 -[23:35:15.552398] Epoch: [2] [6490/6500] lr: 0.000016 closs: 0.7060 (0.7532) grad_norm: 0.3979 (0.4619) time: 5.5681 data: 0.0001 max mem: 71357 -[23:36:06.410367] Epoch: [2] Total time: 10:04:27 -[23:36:06.411251] Averaged stats: lr: 0.000016 closs: 0.7366 (0.7503) grad_norm: 0.3976 (0.4619) -[23:36:06.578363] model saved -[23:36:07.582844] optimizer saved -[23:36:07.583305] other rank-common saved -[23:36:07.586540] rank-specific saved -[23:36:07.595987] log_dir: ./output_dir -[23:36:15.789670] Epoch: [3] [0/6500] lr: 0.000016 closs: 0.6146 (0.6146) time: 8.1929 data: 2.6079 max mem: 71357 -[23:37:11.511729] Epoch: [3] [10/6500] lr: 0.000016 closs: 0.7006 (0.7128) grad_norm: 0.4318 (0.4230) time: 5.8104 data: 0.2372 max mem: 71357 -[23:38:07.453950] Epoch: [3] [20/6500] lr: 0.000016 closs: 0.7074 (0.7354) grad_norm: 0.3599 (0.4300) time: 5.5831 data: 0.0001 max mem: 71357 -[23:39:03.188210] Epoch: [3] [30/6500] lr: 0.000016 closs: 0.7432 (0.7265) grad_norm: 0.3704 (0.4030) time: 5.5837 data: 0.0001 max mem: 71357 -[23:39:58.904121] Epoch: [3] [40/6500] lr: 0.000016 closs: 0.7133 (0.7253) grad_norm: 0.3738 (0.4072) time: 5.5724 data: 0.0001 max mem: 71357 -[23:40:54.597672] Epoch: [3] [50/6500] lr: 0.000016 closs: 0.7198 (0.7230) grad_norm: 0.3704 (0.4126) time: 5.5704 data: 0.0001 max mem: 71357 -[23:41:50.339082] Epoch: [3] [60/6500] lr: 0.000016 closs: 0.7353 (0.7339) grad_norm: 0.3956 (0.4189) time: 5.5716 data: 0.0001 max mem: 71357 -[23:42:46.233960] Epoch: [3] [70/6500] lr: 0.000016 closs: 0.8521 (0.7525) grad_norm: 0.3956 (0.4162) time: 5.5817 data: 0.0001 max mem: 71357 -[23:43:41.956645] Epoch: [3] [80/6500] lr: 0.000016 closs: 0.7817 (0.7474) grad_norm: 0.3673 (0.4140) time: 5.5808 data: 0.0001 max mem: 71357 -[23:44:37.633777] Epoch: [3] [90/6500] lr: 0.000016 closs: 0.6593 (0.7440) grad_norm: 0.3729 (0.4156) time: 5.5699 data: 0.0001 max mem: 71357 -[23:45:33.361270] Epoch: [3] [100/6500] lr: 0.000016 closs: 0.6504 (0.7386) grad_norm: 0.3673 (0.4159) time: 5.5702 data: 0.0001 max mem: 71357 -[23:46:29.167607] Epoch: [3] [110/6500] lr: 0.000016 closs: 0.7165 (0.7374) grad_norm: 0.3846 (0.4321) time: 5.5766 data: 0.0001 max mem: 71357 -[23:47:25.114397] Epoch: [3] [120/6500] lr: 0.000016 closs: 0.7398 (0.7371) grad_norm: 0.4167 (0.4296) time: 5.5876 data: 0.0001 max mem: 71357 -[23:48:20.766418] Epoch: [3] [130/6500] lr: 0.000016 closs: 0.7793 (0.7405) grad_norm: 0.4167 (0.4313) time: 5.5798 data: 0.0001 max mem: 71357 -[23:49:16.432372] Epoch: [3] [140/6500] lr: 0.000016 closs: 0.7152 (0.7344) grad_norm: 0.3969 (0.4370) time: 5.5658 data: 0.0001 max mem: 71357 -[23:50:12.174575] Epoch: [3] [150/6500] lr: 0.000016 closs: 0.7152 (0.7377) grad_norm: 0.3969 (0.4372) time: 5.5703 data: 0.0001 max mem: 71357 -[23:51:08.059066] Epoch: [3] [160/6500] lr: 0.000016 closs: 0.7234 (0.7331) grad_norm: 0.3990 (0.4364) time: 5.5813 data: 0.0001 max mem: 71357 -[23:52:03.804257] Epoch: [3] [170/6500] lr: 0.000016 closs: 0.6501 (0.7310) grad_norm: 0.4371 (0.4405) time: 5.5814 data: 0.0002 max mem: 71357 -[23:52:59.500243] Epoch: [3] [180/6500] lr: 0.000016 closs: 0.7634 (0.7355) grad_norm: 0.4371 (0.4403) time: 5.5719 data: 0.0002 max mem: 71357 -[23:53:55.305535] Epoch: [3] [190/6500] lr: 0.000016 closs: 0.7560 (0.7352) grad_norm: 0.4015 (0.4372) time: 5.5749 data: 0.0001 max mem: 71357 -[23:54:51.144563] Epoch: [3] [200/6500] lr: 0.000016 closs: 0.7020 (0.7311) grad_norm: 0.4015 (0.4360) time: 5.5821 data: 0.0001 max mem: 71357 -[23:55:46.998394] Epoch: [3] [210/6500] lr: 0.000016 closs: 0.7057 (0.7326) grad_norm: 0.3800 (0.4397) time: 5.5846 data: 0.0001 max mem: 71357 -[23:56:42.733975] Epoch: [3] [220/6500] lr: 0.000016 closs: 0.7198 (0.7340) grad_norm: 0.3790 (0.4489) time: 5.5794 data: 0.0001 max mem: 71357 -[23:57:38.456914] Epoch: [3] [230/6500] lr: 0.000016 closs: 0.6877 (0.7321) grad_norm: 0.3800 (0.4482) time: 5.5729 data: 0.0001 max mem: 71357 -[23:58:34.352787] Epoch: [3] [240/6500] lr: 0.000016 closs: 0.7389 (0.7350) grad_norm: 0.3748 (0.4443) time: 5.5809 data: 0.0001 max mem: 71357 -[23:59:30.047396] Epoch: [3] [250/6500] lr: 0.000015 closs: 0.7643 (0.7352) grad_norm: 0.3739 (0.4411) time: 5.5794 data: 0.0001 max mem: 71357 -[00:00:25.716985] Epoch: [3] [260/6500] lr: 0.000015 closs: 0.7649 (0.7387) grad_norm: 0.3739 (0.4420) time: 5.5681 data: 0.0001 max mem: 71357 -[00:01:21.437902] Epoch: [3] [270/6500] lr: 0.000015 closs: 0.7609 (0.7394) grad_norm: 0.3747 (0.4463) time: 5.5695 data: 0.0001 max mem: 71357 -[00:02:17.206931] Epoch: [3] [280/6500] lr: 0.000015 closs: 0.6993 (0.7386) grad_norm: 0.3980 (0.4447) time: 5.5744 data: 0.0001 max mem: 71357 -[00:03:12.921880] Epoch: [3] [290/6500] lr: 0.000015 closs: 0.7272 (0.7393) grad_norm: 0.4024 (0.4441) time: 5.5741 data: 0.0001 max mem: 71357 -[00:04:08.682771] Epoch: [3] [300/6500] lr: 0.000015 closs: 0.7557 (0.7407) grad_norm: 0.4064 (0.4482) time: 5.5737 data: 0.0001 max mem: 71357 -[00:05:04.350866] Epoch: [3] [310/6500] lr: 0.000015 closs: 0.7481 (0.7401) grad_norm: 0.4064 (0.4465) time: 5.5713 data: 0.0001 max mem: 71357 -[00:06:00.086395] Epoch: [3] [320/6500] lr: 0.000015 closs: 0.7440 (0.7390) grad_norm: 0.3864 (0.4437) time: 5.5701 data: 0.0001 max mem: 71357 -[00:06:55.945079] Epoch: [3] [330/6500] lr: 0.000015 closs: 0.7742 (0.7414) grad_norm: 0.3987 (0.4443) time: 5.5796 data: 0.0001 max mem: 71357 -[00:07:51.657783] Epoch: [3] [340/6500] lr: 0.000015 closs: 0.7810 (0.7444) grad_norm: 0.3864 (0.4482) time: 5.5785 data: 0.0001 max mem: 71357 -[00:08:47.398301] Epoch: [3] [350/6500] lr: 0.000015 closs: 0.7696 (0.7429) grad_norm: 0.3989 (0.4466) time: 5.5725 data: 0.0001 max mem: 71357 -[00:09:43.161770] Epoch: [3] [360/6500] lr: 0.000015 closs: 0.7516 (0.7433) grad_norm: 0.4093 (0.4457) time: 5.5751 data: 0.0001 max mem: 71357 -[00:10:38.853854] Epoch: [3] [370/6500] lr: 0.000015 closs: 0.7676 (0.7444) grad_norm: 0.4122 (0.4464) time: 5.5727 data: 0.0001 max mem: 71357 -[00:11:34.646934] Epoch: [3] [380/6500] lr: 0.000015 closs: 0.7893 (0.7451) grad_norm: 0.3989 (0.4464) time: 5.5742 data: 0.0001 max mem: 71357 -[00:12:30.405911] Epoch: [3] [390/6500] lr: 0.000015 closs: 0.8063 (0.7453) grad_norm: 0.4085 (0.4461) time: 5.5775 data: 0.0001 max mem: 71357 -[00:13:26.046470] Epoch: [3] [400/6500] lr: 0.000015 closs: 0.7670 (0.7463) grad_norm: 0.4090 (0.4495) time: 5.5699 data: 0.0001 max mem: 71357 -[00:14:21.747482] Epoch: [3] [410/6500] lr: 0.000015 closs: 0.7218 (0.7445) grad_norm: 0.3901 (0.4504) time: 5.5670 data: 0.0001 max mem: 71357 -[00:15:17.668173] Epoch: [3] [420/6500] lr: 0.000015 closs: 0.6873 (0.7438) grad_norm: 0.3901 (0.4491) time: 5.5810 data: 0.0001 max mem: 71357 -[00:16:13.463723] Epoch: [3] [430/6500] lr: 0.000015 closs: 0.7053 (0.7430) grad_norm: 0.3729 (0.4473) time: 5.5857 data: 0.0001 max mem: 71357 -[00:17:09.143060] Epoch: [3] [440/6500] lr: 0.000015 closs: 0.7145 (0.7429) grad_norm: 0.3853 (0.4475) time: 5.5737 data: 0.0001 max mem: 71357 -[00:18:04.852034] Epoch: [3] [450/6500] lr: 0.000015 closs: 0.7834 (0.7436) grad_norm: 0.3907 (0.4470) time: 5.5693 data: 0.0001 max mem: 71357 -[00:19:00.721499] Epoch: [3] [460/6500] lr: 0.000015 closs: 0.7946 (0.7431) grad_norm: 0.3853 (0.4447) time: 5.5788 data: 0.0001 max mem: 71357 -[00:19:56.460552] Epoch: [3] [470/6500] lr: 0.000015 closs: 0.7314 (0.7425) grad_norm: 0.3587 (0.4440) time: 5.5803 data: 0.0001 max mem: 71357 -[00:20:52.121137] Epoch: [3] [480/6500] lr: 0.000015 closs: 0.7648 (0.7444) grad_norm: 0.3587 (0.4444) time: 5.5699 data: 0.0001 max mem: 71357 -[00:21:47.795418] Epoch: [3] [490/6500] lr: 0.000015 closs: 0.7545 (0.7415) grad_norm: 0.3587 (0.4470) time: 5.5667 data: 0.0001 max mem: 71357 -[00:22:43.610741] Epoch: [3] [500/6500] lr: 0.000015 closs: 0.6561 (0.7423) grad_norm: 0.3719 (0.4456) time: 5.5744 data: 0.0001 max mem: 71357 -[00:23:39.289884] Epoch: [3] [510/6500] lr: 0.000015 closs: 0.7800 (0.7433) grad_norm: 0.4030 (0.4467) time: 5.5746 data: 0.0001 max mem: 71357 -[00:24:34.955681] Epoch: [3] [520/6500] lr: 0.000015 closs: 0.7182 (0.7408) grad_norm: 0.3815 (0.4456) time: 5.5672 data: 0.0001 max mem: 71357 -[00:25:30.752698] Epoch: [3] [530/6500] lr: 0.000015 closs: 0.7124 (0.7414) grad_norm: 0.3815 (0.4451) time: 5.5730 data: 0.0001 max mem: 71357 -[00:26:26.458444] Epoch: [3] [540/6500] lr: 0.000015 closs: 0.7216 (0.7405) grad_norm: 0.3815 (0.4445) time: 5.5750 data: 0.0001 max mem: 71357 -[00:27:22.282296] Epoch: [3] [550/6500] lr: 0.000015 closs: 0.7084 (0.7406) grad_norm: 0.3815 (0.4461) time: 5.5764 data: 0.0001 max mem: 71357 -[00:28:17.976928] Epoch: [3] [560/6500] lr: 0.000015 closs: 0.7254 (0.7403) grad_norm: 0.4211 (0.4493) time: 5.5759 data: 0.0001 max mem: 71357 -[00:29:13.811494] Epoch: [3] [570/6500] lr: 0.000015 closs: 0.6689 (0.7397) grad_norm: 0.3693 (0.4479) time: 5.5764 data: 0.0001 max mem: 71357 -[00:30:09.537691] Epoch: [3] [580/6500] lr: 0.000014 closs: 0.6779 (0.7394) grad_norm: 0.4004 (0.4494) time: 5.5779 data: 0.0001 max mem: 71357 -[00:31:05.357551] Epoch: [3] [590/6500] lr: 0.000014 closs: 0.7062 (0.7387) grad_norm: 0.4039 (0.4512) time: 5.5772 data: 0.0001 max mem: 71357 -[00:32:01.086703] Epoch: [3] [600/6500] lr: 0.000014 closs: 0.6869 (0.7392) grad_norm: 0.4087 (0.4528) time: 5.5774 data: 0.0001 max mem: 71357 -[00:32:56.729576] Epoch: [3] [610/6500] lr: 0.000014 closs: 0.6827 (0.7380) grad_norm: 0.5076 (0.4546) time: 5.5685 data: 0.0001 max mem: 71357 -[00:33:52.466776] Epoch: [3] [620/6500] lr: 0.000014 closs: 0.6962 (0.7388) grad_norm: 0.5076 (0.4552) time: 5.5689 data: 0.0001 max mem: 71357 -[00:34:48.197321] Epoch: [3] [630/6500] lr: 0.000014 closs: 0.7863 (0.7393) grad_norm: 0.5055 (0.4563) time: 5.5733 data: 0.0001 max mem: 71357 -[00:35:43.932519] Epoch: [3] [640/6500] lr: 0.000014 closs: 0.7812 (0.7391) grad_norm: 0.4378 (0.4563) time: 5.5732 data: 0.0001 max mem: 71357 -[00:36:39.649858] Epoch: [3] [650/6500] lr: 0.000014 closs: 0.7366 (0.7392) grad_norm: 0.4040 (0.4562) time: 5.5725 data: 0.0001 max mem: 71357 -[00:37:35.318846] Epoch: [3] [660/6500] lr: 0.000014 closs: 0.6963 (0.7391) grad_norm: 0.4330 (0.4559) time: 5.5692 data: 0.0001 max mem: 71357 -[00:38:30.992475] Epoch: [3] [670/6500] lr: 0.000014 closs: 0.7026 (0.7402) grad_norm: 0.4156 (0.4553) time: 5.5670 data: 0.0001 max mem: 71357 -[00:39:26.871070] Epoch: [3] [680/6500] lr: 0.000014 closs: 0.7783 (0.7407) grad_norm: 0.4330 (0.4568) time: 5.5775 data: 0.0001 max mem: 71357 -[00:40:22.585682] Epoch: [3] [690/6500] lr: 0.000014 closs: 0.7404 (0.7394) grad_norm: 0.4211 (0.4559) time: 5.5795 data: 0.0001 max mem: 71357 -[00:41:18.283696] Epoch: [3] [700/6500] lr: 0.000014 closs: 0.7099 (0.7398) grad_norm: 0.4050 (0.4552) time: 5.5705 data: 0.0001 max mem: 71357 -[00:42:14.020162] Epoch: [3] [710/6500] lr: 0.000014 closs: 0.7389 (0.7400) grad_norm: 0.4050 (0.4566) time: 5.5716 data: 0.0002 max mem: 71357 -[00:43:09.826888] Epoch: [3] [720/6500] lr: 0.000014 closs: 0.7389 (0.7399) grad_norm: 0.3828 (0.4553) time: 5.5771 data: 0.0002 max mem: 71357 -[00:44:05.589010] Epoch: [3] [730/6500] lr: 0.000014 closs: 0.7528 (0.7405) grad_norm: 0.4084 (0.4581) time: 5.5784 data: 0.0001 max mem: 71357 -[00:45:01.263164] Epoch: [3] [740/6500] lr: 0.000014 closs: 0.7142 (0.7401) grad_norm: 0.4105 (0.4583) time: 5.5717 data: 0.0001 max mem: 71357 -[00:45:57.041552] Epoch: [3] [750/6500] lr: 0.000014 closs: 0.7161 (0.7410) grad_norm: 0.4105 (0.4583) time: 5.5725 data: 0.0001 max mem: 71357 -[00:46:52.780365] Epoch: [3] [760/6500] lr: 0.000014 closs: 0.7962 (0.7404) grad_norm: 0.4450 (0.4576) time: 5.5758 data: 0.0001 max mem: 71357 -[00:47:48.538095] Epoch: [3] [770/6500] lr: 0.000014 closs: 0.7276 (0.7410) grad_norm: 0.4256 (0.4577) time: 5.5747 data: 0.0001 max mem: 71357 -[00:48:44.243706] Epoch: [3] [780/6500] lr: 0.000014 closs: 0.7184 (0.7401) grad_norm: 0.4427 (0.4580) time: 5.5731 data: 0.0001 max mem: 71357 -[00:49:39.903887] Epoch: [3] [790/6500] lr: 0.000014 closs: 0.6724 (0.7390) grad_norm: 0.4518 (0.4593) time: 5.5682 data: 0.0001 max mem: 71357 -[00:50:35.705245] Epoch: [3] [800/6500] lr: 0.000014 closs: 0.6724 (0.7385) grad_norm: 0.4518 (0.4585) time: 5.5730 data: 0.0001 max mem: 71357 -[00:51:31.514197] Epoch: [3] [810/6500] lr: 0.000014 closs: 0.6940 (0.7379) grad_norm: 0.4427 (0.4582) time: 5.5804 data: 0.0001 max mem: 71357 -[00:52:27.324671] Epoch: [3] [820/6500] lr: 0.000014 closs: 0.7296 (0.7380) grad_norm: 0.4293 (0.4602) time: 5.5809 data: 0.0001 max mem: 71357 -[00:53:23.091518] Epoch: [3] [830/6500] lr: 0.000014 closs: 0.6945 (0.7383) grad_norm: 0.4142 (0.4598) time: 5.5788 data: 0.0001 max mem: 71357 -[00:54:18.874645] Epoch: [3] [840/6500] lr: 0.000014 closs: 0.7030 (0.7381) grad_norm: 0.3834 (0.4588) time: 5.5774 data: 0.0001 max mem: 71357 -[00:55:14.607446] Epoch: [3] [850/6500] lr: 0.000014 closs: 0.7228 (0.7374) grad_norm: 0.3834 (0.4594) time: 5.5757 data: 0.0001 max mem: 71357 -[00:56:10.454988] Epoch: [3] [860/6500] lr: 0.000014 closs: 0.7877 (0.7379) grad_norm: 0.3806 (0.4584) time: 5.5789 data: 0.0001 max mem: 71357 -[00:57:06.203850] Epoch: [3] [870/6500] lr: 0.000014 closs: 0.7904 (0.7391) grad_norm: 0.3834 (0.4584) time: 5.5797 data: 0.0002 max mem: 71357 -[00:58:01.896877] Epoch: [3] [880/6500] lr: 0.000014 closs: 0.7406 (0.7385) grad_norm: 0.4107 (0.4582) time: 5.5720 data: 0.0002 max mem: 71357 -[00:58:57.712138] Epoch: [3] [890/6500] lr: 0.000014 closs: 0.7278 (0.7389) grad_norm: 0.3963 (0.4577) time: 5.5753 data: 0.0001 max mem: 71357 -[00:59:53.613174] Epoch: [3] [900/6500] lr: 0.000014 closs: 0.7661 (0.7382) grad_norm: 0.4107 (0.4574) time: 5.5857 data: 0.0001 max mem: 71357 -[01:00:49.265294] Epoch: [3] [910/6500] lr: 0.000014 closs: 0.7247 (0.7374) grad_norm: 0.4342 (0.4595) time: 5.5776 data: 0.0001 max mem: 71357 -[01:01:44.937344] Epoch: [3] [920/6500] lr: 0.000013 closs: 0.6816 (0.7375) grad_norm: 0.4522 (0.4598) time: 5.5661 data: 0.0001 max mem: 71357 -[01:02:40.641403] Epoch: [3] [930/6500] lr: 0.000013 closs: 0.7064 (0.7377) grad_norm: 0.4679 (0.4599) time: 5.5687 data: 0.0002 max mem: 71357 -[01:03:36.455103] Epoch: [3] [940/6500] lr: 0.000013 closs: 0.6902 (0.7376) grad_norm: 0.4658 (0.4596) time: 5.5758 data: 0.0002 max mem: 71357 -[01:04:32.206716] Epoch: [3] [950/6500] lr: 0.000013 closs: 0.7056 (0.7373) grad_norm: 0.4290 (0.4587) time: 5.5781 data: 0.0001 max mem: 71357 -[01:05:27.962419] Epoch: [3] [960/6500] lr: 0.000013 closs: 0.7121 (0.7371) grad_norm: 0.4165 (0.4583) time: 5.5753 data: 0.0001 max mem: 71357 -[01:06:23.683254] Epoch: [3] [970/6500] lr: 0.000013 closs: 0.7357 (0.7372) grad_norm: 0.3924 (0.4586) time: 5.5737 data: 0.0001 max mem: 71357 -[01:07:19.353981] Epoch: [3] [980/6500] lr: 0.000013 closs: 0.7258 (0.7370) grad_norm: 0.3924 (0.4584) time: 5.5695 data: 0.0002 max mem: 71357 -[01:08:15.129240] Epoch: [3] [990/6500] lr: 0.000013 closs: 0.7258 (0.7366) grad_norm: 0.4066 (0.4590) time: 5.5722 data: 0.0002 max mem: 71357 -[01:09:10.838789] Epoch: [3] [1000/6500] lr: 0.000013 closs: 0.7418 (0.7373) grad_norm: 0.3735 (0.4601) time: 5.5741 data: 0.0001 max mem: 71357 -[01:10:06.563829] Epoch: [3] [1010/6500] lr: 0.000013 closs: 0.7381 (0.7374) grad_norm: 0.3834 (0.4599) time: 5.5716 data: 0.0001 max mem: 71357 -[01:11:02.258175] Epoch: [3] [1020/6500] lr: 0.000013 closs: 0.7057 (0.7374) grad_norm: 0.4172 (0.4609) time: 5.5709 data: 0.0001 max mem: 71357 -[01:11:58.103183] Epoch: [3] [1030/6500] lr: 0.000013 closs: 0.7993 (0.7380) grad_norm: 0.4857 (0.4617) time: 5.5769 data: 0.0001 max mem: 71357 -[01:12:53.782290] Epoch: [3] [1040/6500] lr: 0.000013 closs: 0.7572 (0.7380) grad_norm: 0.4870 (0.4617) time: 5.5761 data: 0.0001 max mem: 71357 -[01:13:49.475418] Epoch: [3] [1050/6500] lr: 0.000013 closs: 0.7557 (0.7383) grad_norm: 0.4931 (0.4618) time: 5.5685 data: 0.0001 max mem: 71357 -[01:14:45.267483] Epoch: [3] [1060/6500] lr: 0.000013 closs: 0.7168 (0.7382) grad_norm: 0.4857 (0.4613) time: 5.5742 data: 0.0001 max mem: 71357 -[01:15:41.058202] Epoch: [3] [1070/6500] lr: 0.000013 closs: 0.7480 (0.7385) grad_norm: 0.4244 (0.4607) time: 5.5791 data: 0.0001 max mem: 71357 -[01:16:36.972671] Epoch: [3] [1080/6500] lr: 0.000013 closs: 0.7299 (0.7387) grad_norm: 0.3912 (0.4601) time: 5.5852 data: 0.0001 max mem: 71357 -[01:17:32.590345] Epoch: [3] [1090/6500] lr: 0.000013 closs: 0.6990 (0.7382) grad_norm: 0.3655 (0.4591) time: 5.5765 data: 0.0001 max mem: 71357 -[01:18:28.253054] Epoch: [3] [1100/6500] lr: 0.000013 closs: 0.6832 (0.7377) grad_norm: 0.3655 (0.4589) time: 5.5639 data: 0.0001 max mem: 71357 -[01:19:23.978321] Epoch: [3] [1110/6500] lr: 0.000013 closs: 0.7671 (0.7381) grad_norm: 0.3668 (0.4587) time: 5.5693 data: 0.0001 max mem: 71357 -[01:20:19.761158] Epoch: [3] [1120/6500] lr: 0.000013 closs: 0.7495 (0.7377) grad_norm: 0.3668 (0.4587) time: 5.5753 data: 0.0001 max mem: 71357 -[01:21:15.337249] Epoch: [3] [1130/6500] lr: 0.000013 closs: 0.7288 (0.7378) grad_norm: 0.4154 (0.4584) time: 5.5679 data: 0.0001 max mem: 71357 -[01:22:11.125607] Epoch: [3] [1140/6500] lr: 0.000013 closs: 0.6722 (0.7376) grad_norm: 0.3753 (0.4582) time: 5.5682 data: 0.0001 max mem: 71357 -[01:23:06.785225] Epoch: [3] [1150/6500] lr: 0.000013 closs: 0.6987 (0.7377) grad_norm: 0.3753 (0.4577) time: 5.5723 data: 0.0001 max mem: 71357 -[01:24:02.527031] Epoch: [3] [1160/6500] lr: 0.000013 closs: 0.7018 (0.7375) grad_norm: 0.4078 (0.4575) time: 5.5700 data: 0.0001 max mem: 71357 -[01:24:58.344508] Epoch: [3] [1170/6500] lr: 0.000013 closs: 0.7331 (0.7380) grad_norm: 0.3751 (0.4578) time: 5.5779 data: 0.0001 max mem: 71357 -[01:25:53.986697] Epoch: [3] [1180/6500] lr: 0.000013 closs: 0.7686 (0.7380) grad_norm: 0.4078 (0.4579) time: 5.5729 data: 0.0001 max mem: 71357 -[01:26:49.704392] Epoch: [3] [1190/6500] lr: 0.000013 closs: 0.6941 (0.7379) grad_norm: 0.4060 (0.4574) time: 5.5679 data: 0.0001 max mem: 71357 -[01:27:45.361863] Epoch: [3] [1200/6500] lr: 0.000013 closs: 0.7082 (0.7385) grad_norm: 0.4060 (0.4577) time: 5.5686 data: 0.0001 max mem: 71357 -[01:28:41.177916] Epoch: [3] [1210/6500] lr: 0.000013 closs: 0.7895 (0.7388) grad_norm: 0.4182 (0.4589) time: 5.5736 data: 0.0001 max mem: 71357 -[01:29:36.933677] Epoch: [3] [1220/6500] lr: 0.000013 closs: 0.7895 (0.7386) grad_norm: 0.4473 (0.4588) time: 5.5785 data: 0.0001 max mem: 71357 -[01:30:32.611778] Epoch: [3] [1230/6500] lr: 0.000013 closs: 0.6651 (0.7379) grad_norm: 0.4473 (0.4584) time: 5.5716 data: 0.0001 max mem: 71357 -[01:31:28.246111] Epoch: [3] [1240/6500] lr: 0.000013 closs: 0.7076 (0.7379) grad_norm: 0.4307 (0.4586) time: 5.5656 data: 0.0001 max mem: 71357 -[01:32:24.161221] Epoch: [3] [1250/6500] lr: 0.000013 closs: 0.7297 (0.7382) grad_norm: 0.4281 (0.4581) time: 5.5774 data: 0.0001 max mem: 71357 -[01:33:19.939621] Epoch: [3] [1260/6500] lr: 0.000013 closs: 0.7090 (0.7379) grad_norm: 0.3789 (0.4573) time: 5.5846 data: 0.0001 max mem: 71357 -[01:34:15.687899] Epoch: [3] [1270/6500] lr: 0.000013 closs: 0.6752 (0.7378) grad_norm: 0.3819 (0.4575) time: 5.5762 data: 0.0001 max mem: 71357 -[01:35:11.467944] Epoch: [3] [1280/6500] lr: 0.000012 closs: 0.7566 (0.7384) grad_norm: 0.3789 (0.4571) time: 5.5763 data: 0.0001 max mem: 71357 -[01:36:07.120143] Epoch: [3] [1290/6500] lr: 0.000012 closs: 0.8008 (0.7388) grad_norm: 0.3870 (0.4576) time: 5.5715 data: 0.0001 max mem: 71357 -[01:37:02.915605] Epoch: [3] [1300/6500] lr: 0.000012 closs: 0.7662 (0.7389) grad_norm: 0.4138 (0.4575) time: 5.5723 data: 0.0001 max mem: 71357 -[01:37:58.533136] Epoch: [3] [1310/6500] lr: 0.000012 closs: 0.7344 (0.7395) grad_norm: 0.4138 (0.4575) time: 5.5706 data: 0.0001 max mem: 71357 -[01:38:54.316913] Epoch: [3] [1320/6500] lr: 0.000012 closs: 0.7344 (0.7396) grad_norm: 0.4228 (0.4576) time: 5.5700 data: 0.0001 max mem: 71357 -[01:39:50.015096] Epoch: [3] [1330/6500] lr: 0.000012 closs: 0.7196 (0.7398) grad_norm: 0.4228 (0.4582) time: 5.5740 data: 0.0001 max mem: 71357 -[01:40:45.918757] Epoch: [3] [1340/6500] lr: 0.000012 closs: 0.7234 (0.7402) grad_norm: 0.4203 (0.4577) time: 5.5800 data: 0.0001 max mem: 71357 -[01:41:41.617278] Epoch: [3] [1350/6500] lr: 0.000012 closs: 0.7311 (0.7401) grad_norm: 0.4203 (0.4579) time: 5.5800 data: 0.0001 max mem: 71357 -[01:42:37.361572] Epoch: [3] [1360/6500] lr: 0.000012 closs: 0.6839 (0.7396) grad_norm: 0.4300 (0.4578) time: 5.5721 data: 0.0001 max mem: 71357 -[01:43:33.087776] Epoch: [3] [1370/6500] lr: 0.000012 closs: 0.7108 (0.7402) grad_norm: 0.4203 (0.4580) time: 5.5734 data: 0.0001 max mem: 71357 -[01:44:28.900787] Epoch: [3] [1380/6500] lr: 0.000012 closs: 0.8245 (0.7409) grad_norm: 0.4237 (0.4579) time: 5.5769 data: 0.0001 max mem: 71357 -[01:45:24.573697] Epoch: [3] [1390/6500] lr: 0.000012 closs: 0.7725 (0.7407) grad_norm: 0.4237 (0.4583) time: 5.5742 data: 0.0001 max mem: 71357 -[01:46:20.312427] Epoch: [3] [1400/6500] lr: 0.000012 closs: 0.7474 (0.7410) grad_norm: 0.4414 (0.4586) time: 5.5705 data: 0.0001 max mem: 71357 -[01:47:16.093990] Epoch: [3] [1410/6500] lr: 0.000012 closs: 0.7474 (0.7411) grad_norm: 0.4410 (0.4586) time: 5.5759 data: 0.0001 max mem: 71357 -[01:48:11.816746] Epoch: [3] [1420/6500] lr: 0.000012 closs: 0.7565 (0.7411) grad_norm: 0.4414 (0.4585) time: 5.5751 data: 0.0001 max mem: 71357 -[01:49:07.716266] Epoch: [3] [1430/6500] lr: 0.000012 closs: 0.7565 (0.7411) grad_norm: 0.4410 (0.4582) time: 5.5810 data: 0.0001 max mem: 71357 -[01:50:03.388723] Epoch: [3] [1440/6500] lr: 0.000012 closs: 0.7284 (0.7415) grad_norm: 0.4376 (0.4584) time: 5.5785 data: 0.0001 max mem: 71357 -[01:50:59.024566] Epoch: [3] [1450/6500] lr: 0.000012 closs: 0.7738 (0.7418) grad_norm: 0.4376 (0.4589) time: 5.5653 data: 0.0001 max mem: 71357 -[01:51:54.723070] Epoch: [3] [1460/6500] lr: 0.000012 closs: 0.7617 (0.7418) grad_norm: 0.4371 (0.4589) time: 5.5667 data: 0.0001 max mem: 71357 -[01:52:50.538357] Epoch: [3] [1470/6500] lr: 0.000012 closs: 0.6811 (0.7416) grad_norm: 0.4182 (0.4585) time: 5.5756 data: 0.0001 max mem: 71357 -[01:53:46.361051] Epoch: [3] [1480/6500] lr: 0.000012 closs: 0.7539 (0.7417) grad_norm: 0.4128 (0.4580) time: 5.5818 data: 0.0001 max mem: 71357 -[01:54:42.096154] Epoch: [3] [1490/6500] lr: 0.000012 closs: 0.7626 (0.7418) grad_norm: 0.3807 (0.4579) time: 5.5778 data: 0.0001 max mem: 71357 -[01:55:37.782445] Epoch: [3] [1500/6500] lr: 0.000012 closs: 0.7505 (0.7420) grad_norm: 0.3785 (0.4578) time: 5.5710 data: 0.0001 max mem: 71357 -[01:56:33.437642] Epoch: [3] [1510/6500] lr: 0.000012 closs: 0.7306 (0.7415) grad_norm: 0.3785 (0.4576) time: 5.5670 data: 0.0001 max mem: 71357 -[01:57:29.352279] Epoch: [3] [1520/6500] lr: 0.000012 closs: 0.6655 (0.7410) grad_norm: 0.3785 (0.4571) time: 5.5784 data: 0.0001 max mem: 71357 -[01:58:24.994944] Epoch: [3] [1530/6500] lr: 0.000012 closs: 0.6983 (0.7410) grad_norm: 0.4127 (0.4581) time: 5.5778 data: 0.0001 max mem: 71357 -[01:59:20.733197] Epoch: [3] [1540/6500] lr: 0.000012 closs: 0.6983 (0.7407) grad_norm: 0.4055 (0.4574) time: 5.5690 data: 0.0001 max mem: 71357 -[02:00:16.503116] Epoch: [3] [1550/6500] lr: 0.000012 closs: 0.7581 (0.7409) grad_norm: 0.3975 (0.4673) time: 5.5753 data: 0.0001 max mem: 71357 -[02:01:12.253133] Epoch: [3] [1560/6500] lr: 0.000012 closs: 0.7520 (0.7410) grad_norm: 0.4127 (0.4677) time: 5.5759 data: 0.0001 max mem: 71357 -[02:02:08.003585] Epoch: [3] [1570/6500] lr: 0.000012 closs: 0.7520 (0.7412) grad_norm: 0.3975 (0.4677) time: 5.5749 data: 0.0001 max mem: 71357 -[02:03:03.622228] Epoch: [3] [1580/6500] lr: 0.000012 closs: 0.7429 (0.7413) grad_norm: 0.4225 (0.4673) time: 5.5684 data: 0.0001 max mem: 71357 -[02:03:59.278490] Epoch: [3] [1590/6500] lr: 0.000012 closs: 0.6785 (0.7412) grad_norm: 0.4574 (0.4672) time: 5.5637 data: 0.0001 max mem: 71357 -[02:04:54.995154] Epoch: [3] [1600/6500] lr: 0.000012 closs: 0.7179 (0.7411) grad_norm: 0.4226 (0.4683) time: 5.5686 data: 0.0001 max mem: 71357 -[02:05:50.719879] Epoch: [3] [1610/6500] lr: 0.000012 closs: 0.7179 (0.7411) grad_norm: 0.4166 (0.4696) time: 5.5720 data: 0.0001 max mem: 71357 -[02:06:46.372165] Epoch: [3] [1620/6500] lr: 0.000012 closs: 0.7214 (0.7412) grad_norm: 0.4342 (0.4695) time: 5.5688 data: 0.0001 max mem: 71357 -[02:07:42.042038] Epoch: [3] [1630/6500] lr: 0.000012 closs: 0.7214 (0.7415) grad_norm: 0.4226 (0.4692) time: 5.5660 data: 0.0001 max mem: 71357 -[02:08:37.707675] Epoch: [3] [1640/6500] lr: 0.000012 closs: 0.6832 (0.7413) grad_norm: 0.4152 (0.4688) time: 5.5667 data: 0.0001 max mem: 71357 -[02:09:33.583277] Epoch: [3] [1650/6500] lr: 0.000012 closs: 0.7236 (0.7416) grad_norm: 0.3965 (0.4683) time: 5.5770 data: 0.0001 max mem: 71357 -[02:10:29.261319] Epoch: [3] [1660/6500] lr: 0.000012 closs: 0.7659 (0.7420) grad_norm: 0.3933 (0.4688) time: 5.5776 data: 0.0001 max mem: 71357 -[02:11:25.002648] Epoch: [3] [1670/6500] lr: 0.000011 closs: 0.7812 (0.7422) grad_norm: 0.3842 (0.4682) time: 5.5709 data: 0.0001 max mem: 71357 -[02:12:20.775138] Epoch: [3] [1680/6500] lr: 0.000011 closs: 0.7292 (0.7417) grad_norm: 0.3713 (0.4677) time: 5.5756 data: 0.0001 max mem: 71357 -[02:13:16.590435] Epoch: [3] [1690/6500] lr: 0.000011 closs: 0.7292 (0.7418) grad_norm: 0.3856 (0.4674) time: 5.5793 data: 0.0001 max mem: 71357 -[02:14:12.309570] Epoch: [3] [1700/6500] lr: 0.000011 closs: 0.7039 (0.7418) grad_norm: 0.3970 (0.4672) time: 5.5766 data: 0.0001 max mem: 71357 -[02:15:08.139498] Epoch: [3] [1710/6500] lr: 0.000011 closs: 0.6996 (0.7415) grad_norm: 0.4101 (0.4673) time: 5.5774 data: 0.0001 max mem: 71357 -[02:16:03.905752] Epoch: [3] [1720/6500] lr: 0.000011 closs: 0.7542 (0.7416) grad_norm: 0.4146 (0.4669) time: 5.5797 data: 0.0001 max mem: 71357 -[02:16:59.594667] Epoch: [3] [1730/6500] lr: 0.000011 closs: 0.7506 (0.7417) grad_norm: 0.4146 (0.4666) time: 5.5726 data: 0.0001 max mem: 71357 -[02:17:55.268246] Epoch: [3] [1740/6500] lr: 0.000011 closs: 0.6567 (0.7410) grad_norm: 0.4084 (0.4667) time: 5.5680 data: 0.0001 max mem: 71357 -[02:18:50.907554] Epoch: [3] [1750/6500] lr: 0.000011 closs: 0.6691 (0.7413) grad_norm: 0.4212 (0.4668) time: 5.5656 data: 0.0001 max mem: 71357 -[02:19:46.589279] Epoch: [3] [1760/6500] lr: 0.000011 closs: 0.6691 (0.7410) grad_norm: 0.4253 (0.4670) time: 5.5660 data: 0.0001 max mem: 71357 -[02:20:42.306504] Epoch: [3] [1770/6500] lr: 0.000011 closs: 0.6992 (0.7412) grad_norm: 0.4253 (0.4667) time: 5.5699 data: 0.0001 max mem: 71357 -[02:21:38.162516] Epoch: [3] [1780/6500] lr: 0.000011 closs: 0.7811 (0.7412) grad_norm: 0.4221 (0.4664) time: 5.5786 data: 0.0001 max mem: 71357 -[02:22:33.796796] Epoch: [3] [1790/6500] lr: 0.000011 closs: 0.8156 (0.7417) grad_norm: 0.4038 (0.4661) time: 5.5744 data: 0.0001 max mem: 71357 -[02:23:29.492190] Epoch: [3] [1800/6500] lr: 0.000011 closs: 0.8221 (0.7420) grad_norm: 0.4221 (0.4661) time: 5.5664 data: 0.0001 max mem: 71357 -[02:24:25.122788] Epoch: [3] [1810/6500] lr: 0.000011 closs: 0.7031 (0.7419) grad_norm: 0.4221 (0.4658) time: 5.5662 data: 0.0001 max mem: 71357 -[02:25:20.865166] Epoch: [3] [1820/6500] lr: 0.000011 closs: 0.7157 (0.7420) grad_norm: 0.4443 (0.4663) time: 5.5686 data: 0.0001 max mem: 71357 -[02:26:16.547329] Epoch: [3] [1830/6500] lr: 0.000011 closs: 0.7475 (0.7419) grad_norm: 0.4443 (0.4661) time: 5.5712 data: 0.0001 max mem: 71357 -[02:27:12.190688] Epoch: [3] [1840/6500] lr: 0.000011 closs: 0.7351 (0.7418) grad_norm: 0.4383 (0.4659) time: 5.5662 data: 0.0001 max mem: 71357 -[02:28:07.854923] Epoch: [3] [1850/6500] lr: 0.000011 closs: 0.7313 (0.7419) grad_norm: 0.4383 (0.4655) time: 5.5653 data: 0.0001 max mem: 71357 -[02:29:03.539524] Epoch: [3] [1860/6500] lr: 0.000011 closs: 0.7179 (0.7419) grad_norm: 0.3883 (0.4656) time: 5.5674 data: 0.0001 max mem: 71357 -[02:29:59.225284] Epoch: [3] [1870/6500] lr: 0.000011 closs: 0.7383 (0.7420) grad_norm: 0.4725 (0.4658) time: 5.5684 data: 0.0001 max mem: 71357 -[02:30:54.926822] Epoch: [3] [1880/6500] lr: 0.000011 closs: 0.7903 (0.7422) grad_norm: 0.4160 (0.4656) time: 5.5693 data: 0.0001 max mem: 71357 -[02:31:50.596193] Epoch: [3] [1890/6500] lr: 0.000011 closs: 0.7737 (0.7417) grad_norm: 0.4482 (0.4657) time: 5.5684 data: 0.0001 max mem: 71357 -[02:32:46.225288] Epoch: [3] [1900/6500] lr: 0.000011 closs: 0.7226 (0.7418) grad_norm: 0.4482 (0.4662) time: 5.5648 data: 0.0001 max mem: 71357 -[02:33:42.077006] Epoch: [3] [1910/6500] lr: 0.000011 closs: 0.7017 (0.7414) grad_norm: 0.4312 (0.4662) time: 5.5740 data: 0.0001 max mem: 71357 -[02:34:37.702178] Epoch: [3] [1920/6500] lr: 0.000011 closs: 0.6920 (0.7413) grad_norm: 0.4146 (0.4657) time: 5.5738 data: 0.0001 max mem: 71357 -[02:35:33.312850] Epoch: [3] [1930/6500] lr: 0.000011 closs: 0.7232 (0.7414) grad_norm: 0.4084 (0.4657) time: 5.5617 data: 0.0001 max mem: 71357 -[02:36:29.008470] Epoch: [3] [1940/6500] lr: 0.000011 closs: 0.7370 (0.7416) grad_norm: 0.3967 (0.4656) time: 5.5652 data: 0.0001 max mem: 71357 -[02:37:24.578544] Epoch: [3] [1950/6500] lr: 0.000011 closs: 0.6784 (0.7410) grad_norm: 0.4146 (0.4656) time: 5.5632 data: 0.0001 max mem: 71357 -[02:38:20.386332] Epoch: [3] [1960/6500] lr: 0.000011 closs: 0.7038 (0.7412) grad_norm: 0.4360 (0.4668) time: 5.5688 data: 0.0001 max mem: 71357 -[02:39:16.102321] Epoch: [3] [1970/6500] lr: 0.000011 closs: 0.7362 (0.7417) grad_norm: 0.4360 (0.4667) time: 5.5761 data: 0.0001 max mem: 71357 -[02:40:11.878609] Epoch: [3] [1980/6500] lr: 0.000011 closs: 0.7269 (0.7417) grad_norm: 0.4407 (0.4668) time: 5.5746 data: 0.0001 max mem: 71357 -[02:41:07.583458] Epoch: [3] [1990/6500] lr: 0.000011 closs: 0.7227 (0.7414) grad_norm: 0.3940 (0.4665) time: 5.5740 data: 0.0001 max mem: 71357 -[02:42:03.433265] Epoch: [3] [2000/6500] lr: 0.000011 closs: 0.6968 (0.7418) grad_norm: 0.3859 (0.4671) time: 5.5777 data: 0.0001 max mem: 71357 -[02:42:59.197575] Epoch: [3] [2010/6500] lr: 0.000011 closs: 0.7337 (0.7419) grad_norm: 0.3948 (0.4671) time: 5.5806 data: 0.0001 max mem: 71357 -[02:43:54.917992] Epoch: [3] [2020/6500] lr: 0.000011 closs: 0.7337 (0.7419) grad_norm: 0.3868 (0.4670) time: 5.5741 data: 0.0001 max mem: 71357 -[02:44:50.528619] Epoch: [3] [2030/6500] lr: 0.000011 closs: 0.7078 (0.7416) grad_norm: 0.3948 (0.4672) time: 5.5665 data: 0.0001 max mem: 71357 -[02:45:46.347257] Epoch: [3] [2040/6500] lr: 0.000011 closs: 0.7078 (0.7418) grad_norm: 0.3819 (0.4667) time: 5.5714 data: 0.0001 max mem: 71357 -[02:46:42.143262] Epoch: [3] [2050/6500] lr: 0.000011 closs: 0.8076 (0.7420) grad_norm: 0.3655 (0.4664) time: 5.5807 data: 0.0001 max mem: 71357 -[02:47:37.791827] Epoch: [3] [2060/6500] lr: 0.000011 closs: 0.7841 (0.7419) grad_norm: 0.3819 (0.4661) time: 5.5721 data: 0.0001 max mem: 71357 -[02:48:33.510323] Epoch: [3] [2070/6500] lr: 0.000010 closs: 0.7400 (0.7420) grad_norm: 0.3773 (0.4659) time: 5.5683 data: 0.0001 max mem: 71357 -[02:49:29.250770] Epoch: [3] [2080/6500] lr: 0.000010 closs: 0.7386 (0.7420) grad_norm: 0.3972 (0.4665) time: 5.5729 data: 0.0001 max mem: 71357 -[02:50:25.143434] Epoch: [3] [2090/6500] lr: 0.000010 closs: 0.7386 (0.7423) grad_norm: 0.4165 (0.4708) time: 5.5816 data: 0.0001 max mem: 71357 -[02:51:20.673668] Epoch: [3] [2100/6500] lr: 0.000010 closs: 0.7314 (0.7420) grad_norm: 0.5068 (0.4721) time: 5.5711 data: 0.0001 max mem: 71357 -[02:52:16.282560] Epoch: [3] [2110/6500] lr: 0.000010 closs: 0.7098 (0.7423) grad_norm: 0.5068 (0.4721) time: 5.5569 data: 0.0001 max mem: 71357 -[02:53:11.994560] Epoch: [3] [2120/6500] lr: 0.000010 closs: 0.7516 (0.7426) grad_norm: 0.4518 (0.4720) time: 5.5660 data: 0.0001 max mem: 71357 -[02:54:07.738349] Epoch: [3] [2130/6500] lr: 0.000010 closs: 0.7340 (0.7425) grad_norm: 0.4506 (0.4718) time: 5.5727 data: 0.0001 max mem: 71357 -[02:55:03.531949] Epoch: [3] [2140/6500] lr: 0.000010 closs: 0.6876 (0.7423) grad_norm: 0.4036 (0.4718) time: 5.5768 data: 0.0001 max mem: 71357 -[02:55:59.106827] Epoch: [3] [2150/6500] lr: 0.000010 closs: 0.6876 (0.7424) grad_norm: 0.4139 (0.4721) time: 5.5683 data: 0.0001 max mem: 71357 -[02:56:54.769431] Epoch: [3] [2160/6500] lr: 0.000010 closs: 0.7490 (0.7425) grad_norm: 0.4139 (0.4718) time: 5.5618 data: 0.0001 max mem: 71357 -[02:57:50.424404] Epoch: [3] [2170/6500] lr: 0.000010 closs: 0.6904 (0.7421) grad_norm: 0.4139 (0.4724) time: 5.5658 data: 0.0001 max mem: 71357 -[02:58:46.204630] Epoch: [3] [2180/6500] lr: 0.000010 closs: 0.7001 (0.7419) grad_norm: 0.4364 (0.4725) time: 5.5717 data: 0.0001 max mem: 71357 -[02:59:41.913462] Epoch: [3] [2190/6500] lr: 0.000010 closs: 0.7383 (0.7419) grad_norm: 0.4115 (0.4721) time: 5.5744 data: 0.0001 max mem: 71357 -[03:00:37.544665] Epoch: [3] [2200/6500] lr: 0.000010 closs: 0.7552 (0.7420) grad_norm: 0.3997 (0.4716) time: 5.5669 data: 0.0001 max mem: 71357 -[03:01:33.065049] Epoch: [3] [2210/6500] lr: 0.000010 closs: 0.7129 (0.7416) grad_norm: 0.4201 (0.4722) time: 5.5575 data: 0.0001 max mem: 71357 -[03:02:28.964229] Epoch: [3] [2220/6500] lr: 0.000010 closs: 0.7129 (0.7415) grad_norm: 0.3805 (0.4718) time: 5.5709 data: 0.0001 max mem: 71357 -[03:03:24.711773] Epoch: [3] [2230/6500] lr: 0.000010 closs: 0.7841 (0.7417) grad_norm: 0.4201 (0.4720) time: 5.5823 data: 0.0001 max mem: 71357 -[03:04:20.376127] Epoch: [3] [2240/6500] lr: 0.000010 closs: 0.7152 (0.7416) grad_norm: 0.4451 (0.4735) time: 5.5705 data: 0.0001 max mem: 71357 -[03:05:16.094408] Epoch: [3] [2250/6500] lr: 0.000010 closs: 0.7152 (0.7417) grad_norm: 0.3921 (0.4732) time: 5.5690 data: 0.0001 max mem: 71357 -[03:06:11.929158] Epoch: [3] [2260/6500] lr: 0.000010 closs: 0.7516 (0.7419) grad_norm: 0.4235 (0.4729) time: 5.5776 data: 0.0001 max mem: 71357 -[03:07:07.681225] Epoch: [3] [2270/6500] lr: 0.000010 closs: 0.7474 (0.7419) grad_norm: 0.4103 (0.4726) time: 5.5793 data: 0.0001 max mem: 71357 -[03:08:03.330780] Epoch: [3] [2280/6500] lr: 0.000010 closs: 0.7457 (0.7421) grad_norm: 0.4028 (0.4724) time: 5.5700 data: 0.0001 max mem: 71357 -[03:08:59.036717] Epoch: [3] [2290/6500] lr: 0.000010 closs: 0.7586 (0.7420) grad_norm: 0.4277 (0.4728) time: 5.5677 data: 0.0001 max mem: 71357 -[03:09:54.753564] Epoch: [3] [2300/6500] lr: 0.000010 closs: 0.7384 (0.7419) grad_norm: 0.4449 (0.4728) time: 5.5710 data: 0.0001 max mem: 71357 -[03:10:50.585338] Epoch: [3] [2310/6500] lr: 0.000010 closs: 0.6609 (0.7415) grad_norm: 0.4297 (0.4726) time: 5.5773 data: 0.0001 max mem: 71357 -[03:11:46.300428] Epoch: [3] [2320/6500] lr: 0.000010 closs: 0.6929 (0.7417) grad_norm: 0.4388 (0.4725) time: 5.5773 data: 0.0001 max mem: 71357 -[03:12:42.024279] Epoch: [3] [2330/6500] lr: 0.000010 closs: 0.7588 (0.7419) grad_norm: 0.4257 (0.4725) time: 5.5719 data: 0.0001 max mem: 71357 -[03:13:37.667847] Epoch: [3] [2340/6500] lr: 0.000010 closs: 0.7638 (0.7420) grad_norm: 0.4236 (0.4723) time: 5.5683 data: 0.0001 max mem: 71357 -[03:14:33.402863] Epoch: [3] [2350/6500] lr: 0.000010 closs: 0.7448 (0.7419) grad_norm: 0.4299 (0.4724) time: 5.5688 data: 0.0001 max mem: 71357 -[03:15:29.150906] Epoch: [3] [2360/6500] lr: 0.000010 closs: 0.6643 (0.7415) grad_norm: 0.4257 (0.4721) time: 5.5741 data: 0.0001 max mem: 71357 -[03:16:24.888869] Epoch: [3] [2370/6500] lr: 0.000010 closs: 0.6643 (0.7413) grad_norm: 0.3967 (0.4717) time: 5.5742 data: 0.0001 max mem: 71357 -[03:17:20.515563] Epoch: [3] [2380/6500] lr: 0.000010 closs: 0.6623 (0.7412) grad_norm: 0.4207 (0.4722) time: 5.5682 data: 0.0001 max mem: 71357 -[03:18:16.227993] Epoch: [3] [2390/6500] lr: 0.000010 closs: 0.7421 (0.7414) grad_norm: 0.4128 (0.4721) time: 5.5669 data: 0.0001 max mem: 71357 -[03:19:12.133308] Epoch: [3] [2400/6500] lr: 0.000010 closs: 0.6836 (0.7410) grad_norm: 0.3865 (0.4718) time: 5.5808 data: 0.0001 max mem: 71357 -[03:20:07.896174] Epoch: [3] [2410/6500] lr: 0.000010 closs: 0.6543 (0.7409) grad_norm: 0.4128 (0.4717) time: 5.5833 data: 0.0001 max mem: 71357 -[03:21:03.510971] Epoch: [3] [2420/6500] lr: 0.000010 closs: 0.7160 (0.7409) grad_norm: 0.4059 (0.4716) time: 5.5688 data: 0.0001 max mem: 71357 -[03:21:59.207139] Epoch: [3] [2430/6500] lr: 0.000010 closs: 0.7137 (0.7409) grad_norm: 0.4059 (0.4717) time: 5.5655 data: 0.0001 max mem: 71357 -[03:22:55.037062] Epoch: [3] [2440/6500] lr: 0.000010 closs: 0.6463 (0.7407) grad_norm: 0.4322 (0.4724) time: 5.5762 data: 0.0001 max mem: 71357 -[03:23:50.643702] Epoch: [3] [2450/6500] lr: 0.000010 closs: 0.6266 (0.7404) grad_norm: 0.4504 (0.4727) time: 5.5717 data: 0.0001 max mem: 71357 -[03:24:46.427959] Epoch: [3] [2460/6500] lr: 0.000010 closs: 0.6887 (0.7405) grad_norm: 0.4398 (0.4724) time: 5.5694 data: 0.0001 max mem: 71357 -[03:25:42.142791] Epoch: [3] [2470/6500] lr: 0.000010 closs: 0.7283 (0.7406) grad_norm: 0.4504 (0.4723) time: 5.5749 data: 0.0001 max mem: 71357 -[03:26:38.004975] Epoch: [3] [2480/6500] lr: 0.000010 closs: 0.6969 (0.7404) grad_norm: 0.4335 (0.4719) time: 5.5788 data: 0.0001 max mem: 71357 -[03:27:33.744560] Epoch: [3] [2490/6500] lr: 0.000010 closs: 0.6872 (0.7404) grad_norm: 0.3993 (0.4719) time: 5.5800 data: 0.0001 max mem: 71357 -[03:28:29.410295] Epoch: [3] [2500/6500] lr: 0.000010 closs: 0.6872 (0.7403) grad_norm: 0.4124 (0.4718) time: 5.5702 data: 0.0001 max mem: 71357 -[03:29:25.128655] Epoch: [3] [2510/6500] lr: 0.000009 closs: 0.7039 (0.7404) grad_norm: 0.3874 (0.4718) time: 5.5691 data: 0.0001 max mem: 71357 -[03:30:20.824465] Epoch: [3] [2520/6500] lr: 0.000009 closs: 0.7661 (0.7407) grad_norm: 0.3943 (0.4717) time: 5.5706 data: 0.0001 max mem: 71357 -[03:31:16.630889] Epoch: [3] [2530/6500] lr: 0.000009 closs: 0.7887 (0.7411) grad_norm: 0.3874 (0.4713) time: 5.5750 data: 0.0001 max mem: 71357 -[03:32:12.221908] Epoch: [3] [2540/6500] lr: 0.000009 closs: 0.7857 (0.7410) grad_norm: 0.3943 (0.4717) time: 5.5698 data: 0.0001 max mem: 71357 -[03:33:07.870496] Epoch: [3] [2550/6500] lr: 0.000009 closs: 0.7204 (0.7411) grad_norm: 0.4102 (0.4718) time: 5.5619 data: 0.0001 max mem: 71357 -[03:34:03.554231] Epoch: [3] [2560/6500] lr: 0.000009 closs: 0.7516 (0.7415) grad_norm: 0.4711 (0.4719) time: 5.5666 data: 0.0001 max mem: 71357 -[03:34:59.248273] Epoch: [3] [2570/6500] lr: 0.000009 closs: 0.7628 (0.7414) grad_norm: 0.4896 (0.4722) time: 5.5688 data: 0.0001 max mem: 71357 -[03:35:54.952768] Epoch: [3] [2580/6500] lr: 0.000009 closs: 0.6777 (0.7414) grad_norm: 0.4469 (0.4719) time: 5.5698 data: 0.0001 max mem: 71357 -[03:36:50.647324] Epoch: [3] [2590/6500] lr: 0.000009 closs: 0.7527 (0.7414) grad_norm: 0.4279 (0.4720) time: 5.5699 data: 0.0001 max mem: 71357 -[03:37:46.354833] Epoch: [3] [2600/6500] lr: 0.000009 closs: 0.7692 (0.7415) grad_norm: 0.3699 (0.4717) time: 5.5700 data: 0.0001 max mem: 71357 -[03:38:41.964847] Epoch: [3] [2610/6500] lr: 0.000009 closs: 0.7800 (0.7416) grad_norm: 0.3615 (0.4716) time: 5.5658 data: 0.0001 max mem: 71357 -[03:39:37.792299] Epoch: [3] [2620/6500] lr: 0.000009 closs: 0.6826 (0.7413) grad_norm: 0.4102 (0.4716) time: 5.5718 data: 0.0001 max mem: 71357 -[03:40:33.423699] Epoch: [3] [2630/6500] lr: 0.000009 closs: 0.7121 (0.7411) grad_norm: 0.3668 (0.4712) time: 5.5729 data: 0.0001 max mem: 71357 -[03:41:29.121161] Epoch: [3] [2640/6500] lr: 0.000009 closs: 0.7084 (0.7409) grad_norm: 0.4153 (0.4710) time: 5.5663 data: 0.0001 max mem: 71357 -[03:42:24.835125] Epoch: [3] [2650/6500] lr: 0.000009 closs: 0.7084 (0.7410) grad_norm: 0.3918 (0.4706) time: 5.5705 data: 0.0001 max mem: 71357 -[03:43:20.618505] Epoch: [3] [2660/6500] lr: 0.000009 closs: 0.7455 (0.7410) grad_norm: 0.3730 (0.4704) time: 5.5748 data: 0.0001 max mem: 71357 -[03:44:16.323482] Epoch: [3] [2670/6500] lr: 0.000009 closs: 0.7791 (0.7412) grad_norm: 0.3868 (0.4702) time: 5.5743 data: 0.0001 max mem: 71357 -[03:45:12.012818] Epoch: [3] [2680/6500] lr: 0.000009 closs: 0.7665 (0.7411) grad_norm: 0.3774 (0.4701) time: 5.5696 data: 0.0001 max mem: 71357 -[03:46:07.663209] Epoch: [3] [2690/6500] lr: 0.000009 closs: 0.6634 (0.7410) grad_norm: 0.4084 (0.4700) time: 5.5669 data: 0.0001 max mem: 71357 -[03:47:03.385219] Epoch: [3] [2700/6500] lr: 0.000009 closs: 0.7106 (0.7409) grad_norm: 0.4270 (0.4699) time: 5.5685 data: 0.0001 max mem: 71357 -[03:47:59.094951] Epoch: [3] [2710/6500] lr: 0.000009 closs: 0.6913 (0.7409) grad_norm: 0.4162 (0.4697) time: 5.5715 data: 0.0001 max mem: 71357 -[03:48:54.689454] Epoch: [3] [2720/6500] lr: 0.000009 closs: 0.6913 (0.7408) grad_norm: 0.4328 (0.4699) time: 5.5651 data: 0.0001 max mem: 71357 -[03:49:50.392630] Epoch: [3] [2730/6500] lr: 0.000009 closs: 0.7583 (0.7411) grad_norm: 0.4328 (0.4698) time: 5.5648 data: 0.0001 max mem: 71357 -[03:50:46.064539] Epoch: [3] [2740/6500] lr: 0.000009 closs: 0.7172 (0.7408) grad_norm: 0.4328 (0.4697) time: 5.5687 data: 0.0001 max mem: 71357 -[03:51:41.898318] Epoch: [3] [2750/6500] lr: 0.000009 closs: 0.7137 (0.7409) grad_norm: 0.4655 (0.4696) time: 5.5752 data: 0.0001 max mem: 71357 -[03:52:37.622614] Epoch: [3] [2760/6500] lr: 0.000009 closs: 0.7675 (0.7411) grad_norm: 0.4293 (0.4695) time: 5.5778 data: 0.0001 max mem: 71357 -[03:53:33.270434] Epoch: [3] [2770/6500] lr: 0.000009 closs: 0.7621 (0.7410) grad_norm: 0.4293 (0.4694) time: 5.5685 data: 0.0001 max mem: 71357 -[03:54:28.938310] Epoch: [3] [2780/6500] lr: 0.000009 closs: 0.7231 (0.7410) grad_norm: 0.4198 (0.4695) time: 5.5657 data: 0.0001 max mem: 71357 -[03:55:24.636676] Epoch: [3] [2790/6500] lr: 0.000009 closs: 0.7205 (0.7410) grad_norm: 0.4177 (0.4694) time: 5.5683 data: 0.0001 max mem: 71357 -[03:56:20.341616] Epoch: [3] [2800/6500] lr: 0.000009 closs: 0.7450 (0.7414) grad_norm: 0.4198 (0.4702) time: 5.5701 data: 0.0001 max mem: 71357 -[03:57:16.007696] Epoch: [3] [2810/6500] lr: 0.000009 closs: 0.7234 (0.7413) grad_norm: 0.4308 (0.4709) time: 5.5685 data: 0.0001 max mem: 71357 -[03:58:11.641875] Epoch: [3] [2820/6500] lr: 0.000009 closs: 0.6968 (0.7413) grad_norm: 0.4058 (0.4706) time: 5.5649 data: 0.0001 max mem: 71357 -[03:59:07.419506] Epoch: [3] [2830/6500] lr: 0.000009 closs: 0.7425 (0.7414) grad_norm: 0.4037 (0.4706) time: 5.5705 data: 0.0001 max mem: 71357 -[04:00:03.264856] Epoch: [3] [2840/6500] lr: 0.000009 closs: 0.7550 (0.7416) grad_norm: 0.4037 (0.4703) time: 5.5811 data: 0.0001 max mem: 71357 -[04:00:58.906002] Epoch: [3] [2850/6500] lr: 0.000009 closs: 0.7550 (0.7417) grad_norm: 0.3923 (0.4703) time: 5.5743 data: 0.0001 max mem: 71357 -[04:01:54.598558] Epoch: [3] [2860/6500] lr: 0.000009 closs: 0.7611 (0.7416) grad_norm: 0.4231 (0.4702) time: 5.5666 data: 0.0001 max mem: 71357 -[04:02:50.245016] Epoch: [3] [2870/6500] lr: 0.000009 closs: 0.7677 (0.7419) grad_norm: 0.4231 (0.4703) time: 5.5669 data: 0.0001 max mem: 71357 -[04:03:45.974594] Epoch: [3] [2880/6500] lr: 0.000009 closs: 0.7250 (0.7417) grad_norm: 0.4318 (0.4703) time: 5.5687 data: 0.0001 max mem: 71357 -[04:04:41.643615] Epoch: [3] [2890/6500] lr: 0.000009 closs: 0.7242 (0.7419) grad_norm: 0.4211 (0.4705) time: 5.5698 data: 0.0001 max mem: 71357 -[04:05:37.230252] Epoch: [3] [2900/6500] lr: 0.000009 closs: 0.7474 (0.7420) grad_norm: 0.4166 (0.4704) time: 5.5627 data: 0.0001 max mem: 71357 -[04:06:32.951079] Epoch: [3] [2910/6500] lr: 0.000009 closs: 0.7044 (0.7418) grad_norm: 0.4166 (0.4703) time: 5.5653 data: 0.0001 max mem: 71357 -[04:07:28.763809] Epoch: [3] [2920/6500] lr: 0.000009 closs: 0.6616 (0.7417) grad_norm: 0.4041 (0.4698) time: 5.5766 data: 0.0001 max mem: 71357 -[04:08:24.491781] Epoch: [3] [2930/6500] lr: 0.000009 closs: 0.6237 (0.7413) grad_norm: 0.4075 (0.4698) time: 5.5770 data: 0.0001 max mem: 71357 -[04:09:20.165615] Epoch: [3] [2940/6500] lr: 0.000009 closs: 0.6831 (0.7411) grad_norm: 0.4126 (0.4697) time: 5.5700 data: 0.0001 max mem: 71357 -[04:10:15.890629] Epoch: [3] [2950/6500] lr: 0.000009 closs: 0.6921 (0.7410) grad_norm: 0.4271 (0.4699) time: 5.5699 data: 0.0001 max mem: 71357 -[04:11:11.666969] Epoch: [3] [2960/6500] lr: 0.000009 closs: 0.6946 (0.7409) grad_norm: 0.4273 (0.4696) time: 5.5750 data: 0.0001 max mem: 71357 -[04:12:07.462625] Epoch: [3] [2970/6500] lr: 0.000009 closs: 0.6784 (0.7407) grad_norm: 0.3911 (0.4695) time: 5.5785 data: 0.0001 max mem: 71357 -[04:13:03.238329] Epoch: [3] [2980/6500] lr: 0.000009 closs: 0.7087 (0.7407) grad_norm: 0.3911 (0.4693) time: 5.5785 data: 0.0001 max mem: 71357 -[04:13:58.891238] Epoch: [3] [2990/6500] lr: 0.000009 closs: 0.7087 (0.7404) grad_norm: 0.3911 (0.4692) time: 5.5713 data: 0.0001 max mem: 71357 -[04:14:54.493479] Epoch: [3] [3000/6500] lr: 0.000008 closs: 0.6673 (0.7403) grad_norm: 0.4436 (0.4696) time: 5.5627 data: 0.0001 max mem: 71357 -[04:15:50.321151] Epoch: [3] [3010/6500] lr: 0.000008 closs: 0.7433 (0.7404) grad_norm: 0.4436 (0.4694) time: 5.5714 data: 0.0001 max mem: 71357 -[04:16:46.107372] Epoch: [3] [3020/6500] lr: 0.000008 closs: 0.7592 (0.7406) grad_norm: 0.4476 (0.4693) time: 5.5806 data: 0.0001 max mem: 71357 -[04:17:41.774052] Epoch: [3] [3030/6500] lr: 0.000008 closs: 0.7118 (0.7405) grad_norm: 0.4055 (0.4690) time: 5.5725 data: 0.0001 max mem: 71357 -[04:18:37.357139] Epoch: [3] [3040/6500] lr: 0.000008 closs: 0.7045 (0.7403) grad_norm: 0.3980 (0.4691) time: 5.5624 data: 0.0001 max mem: 71357 -[04:19:33.179800] Epoch: [3] [3050/6500] lr: 0.000008 closs: 0.7116 (0.7405) grad_norm: 0.3708 (0.4687) time: 5.5702 data: 0.0001 max mem: 71357 -[04:20:29.004476] Epoch: [3] [3060/6500] lr: 0.000008 closs: 0.7054 (0.7404) grad_norm: 0.3656 (0.4685) time: 5.5823 data: 0.0001 max mem: 71357 -[04:21:24.716825] Epoch: [3] [3070/6500] lr: 0.000008 closs: 0.7146 (0.7405) grad_norm: 0.3758 (0.4684) time: 5.5768 data: 0.0001 max mem: 71357 -[04:22:20.360022] Epoch: [3] [3080/6500] lr: 0.000008 closs: 0.7148 (0.7403) grad_norm: 0.3667 (0.4683) time: 5.5677 data: 0.0001 max mem: 71357 -[04:23:16.035938] Epoch: [3] [3090/6500] lr: 0.000008 closs: 0.7148 (0.7403) grad_norm: 0.3956 (0.4683) time: 5.5659 data: 0.0001 max mem: 71357 -[04:24:11.851079] Epoch: [3] [3100/6500] lr: 0.000008 closs: 0.6789 (0.7400) grad_norm: 0.4137 (0.4680) time: 5.5745 data: 0.0001 max mem: 71357 -[04:25:07.496748] Epoch: [3] [3110/6500] lr: 0.000008 closs: 0.6925 (0.7402) grad_norm: 0.3955 (0.4684) time: 5.5730 data: 0.0001 max mem: 71357 -[04:26:03.184320] Epoch: [3] [3120/6500] lr: 0.000008 closs: 0.7409 (0.7402) grad_norm: 0.3955 (0.4682) time: 5.5666 data: 0.0001 max mem: 71357 -[04:26:58.826254] Epoch: [3] [3130/6500] lr: 0.000008 closs: 0.7133 (0.7402) grad_norm: 0.4034 (0.4684) time: 5.5664 data: 0.0001 max mem: 71357 -[04:27:54.567322] Epoch: [3] [3140/6500] lr: 0.000008 closs: 0.6893 (0.7399) grad_norm: 0.4219 (0.4689) time: 5.5691 data: 0.0001 max mem: 71357 -[04:28:50.349706] Epoch: [3] [3150/6500] lr: 0.000008 closs: 0.6893 (0.7402) grad_norm: 0.4405 (0.4689) time: 5.5761 data: 0.0001 max mem: 71357 -[04:29:46.023632] Epoch: [3] [3160/6500] lr: 0.000008 closs: 0.7973 (0.7404) grad_norm: 0.4484 (0.4690) time: 5.5727 data: 0.0001 max mem: 71357 -[04:30:41.687580] Epoch: [3] [3170/6500] lr: 0.000008 closs: 0.7909 (0.7406) grad_norm: 0.4616 (0.4695) time: 5.5668 data: 0.0001 max mem: 71357 -[04:31:37.298065] Epoch: [3] [3180/6500] lr: 0.000008 closs: 0.7909 (0.7408) grad_norm: 0.4616 (0.4695) time: 5.5636 data: 0.0001 max mem: 71357 -[04:32:33.030157] Epoch: [3] [3190/6500] lr: 0.000008 closs: 0.8091 (0.7411) grad_norm: 0.4743 (0.4695) time: 5.5670 data: 0.0001 max mem: 71357 -[04:33:28.658546] Epoch: [3] [3200/6500] lr: 0.000008 closs: 0.7844 (0.7411) grad_norm: 0.4590 (0.4698) time: 5.5679 data: 0.0001 max mem: 71357 -[04:34:24.435166] Epoch: [3] [3210/6500] lr: 0.000008 closs: 0.7599 (0.7413) grad_norm: 0.4360 (0.4700) time: 5.5702 data: 0.0001 max mem: 71357 -[04:35:20.170339] Epoch: [3] [3220/6500] lr: 0.000008 closs: 0.7599 (0.7411) grad_norm: 0.4270 (0.4699) time: 5.5755 data: 0.0001 max mem: 71357 -[04:36:15.966734] Epoch: [3] [3230/6500] lr: 0.000008 closs: 0.6817 (0.7410) grad_norm: 0.4210 (0.4698) time: 5.5765 data: 0.0001 max mem: 71357 -[04:37:11.668373] Epoch: [3] [3240/6500] lr: 0.000008 closs: 0.6817 (0.7409) grad_norm: 0.4270 (0.4698) time: 5.5748 data: 0.0001 max mem: 71357 -[04:38:07.436973] Epoch: [3] [3250/6500] lr: 0.000008 closs: 0.7496 (0.7410) grad_norm: 0.4443 (0.4702) time: 5.5734 data: 0.0001 max mem: 71357 -[04:39:03.208428] Epoch: [3] [3260/6500] lr: 0.000008 closs: 0.6809 (0.7408) grad_norm: 0.4437 (0.4700) time: 5.5769 data: 0.0001 max mem: 71357 -[04:39:58.909951] Epoch: [3] [3270/6500] lr: 0.000008 closs: 0.6809 (0.7408) grad_norm: 0.4473 (0.4700) time: 5.5735 data: 0.0001 max mem: 71357 -[04:40:54.726343] Epoch: [3] [3280/6500] lr: 0.000008 closs: 0.7186 (0.7407) grad_norm: 0.4473 (0.4701) time: 5.5758 data: 0.0001 max mem: 71357 -[04:41:50.415725] Epoch: [3] [3290/6500] lr: 0.000008 closs: 0.7186 (0.7407) grad_norm: 0.4437 (0.4699) time: 5.5752 data: 0.0001 max mem: 71357 -[04:42:46.123758] Epoch: [3] [3300/6500] lr: 0.000008 closs: 0.7253 (0.7407) grad_norm: 0.4245 (0.4697) time: 5.5698 data: 0.0001 max mem: 71357 -[04:43:41.840664] Epoch: [3] [3310/6500] lr: 0.000008 closs: 0.6994 (0.7406) grad_norm: 0.4150 (0.4696) time: 5.5712 data: 0.0001 max mem: 71357 -[04:44:37.594666] Epoch: [3] [3320/6500] lr: 0.000008 closs: 0.7145 (0.7406) grad_norm: 0.4150 (0.4695) time: 5.5735 data: 0.0001 max mem: 71357 -[04:45:33.240389] Epoch: [3] [3330/6500] lr: 0.000008 closs: 0.7095 (0.7405) grad_norm: 0.4161 (0.4696) time: 5.5699 data: 0.0001 max mem: 71357 -[04:46:28.959317] Epoch: [3] [3340/6500] lr: 0.000008 closs: 0.7035 (0.7405) grad_norm: 0.4373 (0.4700) time: 5.5681 data: 0.0001 max mem: 71357 -[04:47:24.606653] Epoch: [3] [3350/6500] lr: 0.000008 closs: 0.7035 (0.7404) grad_norm: 0.4373 (0.4700) time: 5.5682 data: 0.0001 max mem: 71357 -[04:48:20.364255] Epoch: [3] [3360/6500] lr: 0.000008 closs: 0.7093 (0.7403) grad_norm: 0.4540 (0.4699) time: 5.5702 data: 0.0001 max mem: 71357 -[04:49:16.119586] Epoch: [3] [3370/6500] lr: 0.000008 closs: 0.7780 (0.7404) grad_norm: 0.4645 (0.4699) time: 5.5756 data: 0.0001 max mem: 71357 -[04:50:11.776158] Epoch: [3] [3380/6500] lr: 0.000008 closs: 0.8124 (0.7406) grad_norm: 0.4137 (0.4697) time: 5.5705 data: 0.0001 max mem: 71357 -[04:51:07.474741] Epoch: [3] [3390/6500] lr: 0.000008 closs: 0.7139 (0.7405) grad_norm: 0.4485 (0.4700) time: 5.5677 data: 0.0001 max mem: 71357 -[04:52:03.267438] Epoch: [3] [3400/6500] lr: 0.000008 closs: 0.6892 (0.7405) grad_norm: 0.3771 (0.4697) time: 5.5745 data: 0.0001 max mem: 71357 -[04:52:59.023647] Epoch: [3] [3410/6500] lr: 0.000008 closs: 0.7276 (0.7407) grad_norm: 0.3916 (0.4701) time: 5.5774 data: 0.0001 max mem: 71357 -[04:53:54.706683] Epoch: [3] [3420/6500] lr: 0.000008 closs: 0.6954 (0.7408) grad_norm: 0.3916 (0.4698) time: 5.5719 data: 0.0001 max mem: 71357 -[04:54:50.377473] Epoch: [3] [3430/6500] lr: 0.000008 closs: 0.6899 (0.7407) grad_norm: 0.4018 (0.4699) time: 5.5676 data: 0.0001 max mem: 71357 -[04:55:46.036905] Epoch: [3] [3440/6500] lr: 0.000008 closs: 0.7141 (0.7407) grad_norm: 0.4018 (0.4698) time: 5.5664 data: 0.0001 max mem: 71357 -[04:56:41.749545] Epoch: [3] [3450/6500] lr: 0.000008 closs: 0.7220 (0.7406) grad_norm: 0.3857 (0.4696) time: 5.5685 data: 0.0001 max mem: 71357 -[04:57:37.559081] Epoch: [3] [3460/6500] lr: 0.000008 closs: 0.7657 (0.7406) grad_norm: 0.3844 (0.4695) time: 5.5760 data: 0.0001 max mem: 71357 -[04:58:33.212538] Epoch: [3] [3470/6500] lr: 0.000008 closs: 0.7399 (0.7406) grad_norm: 0.3841 (0.4693) time: 5.5731 data: 0.0001 max mem: 71357 -[04:59:28.895834] Epoch: [3] [3480/6500] lr: 0.000008 closs: 0.7152 (0.7408) grad_norm: 0.4047 (0.4693) time: 5.5668 data: 0.0001 max mem: 71357 -[05:00:24.526761] Epoch: [3] [3490/6500] lr: 0.000008 closs: 0.7117 (0.7407) grad_norm: 0.4259 (0.4694) time: 5.5656 data: 0.0001 max mem: 71357 -[05:01:20.288024] Epoch: [3] [3500/6500] lr: 0.000008 closs: 0.7111 (0.7405) grad_norm: 0.4359 (0.4695) time: 5.5695 data: 0.0001 max mem: 71357 -[05:02:15.993095] Epoch: [3] [3510/6500] lr: 0.000008 closs: 0.6038 (0.7403) grad_norm: 0.4570 (0.4696) time: 5.5733 data: 0.0001 max mem: 71357 -[05:03:11.767699] Epoch: [3] [3520/6500] lr: 0.000008 closs: 0.6809 (0.7404) grad_norm: 0.4570 (0.4696) time: 5.5739 data: 0.0001 max mem: 71357 -[05:04:07.414649] Epoch: [3] [3530/6500] lr: 0.000008 closs: 0.7217 (0.7403) grad_norm: 0.4787 (0.4703) time: 5.5710 data: 0.0001 max mem: 71357 -[05:05:03.283087] Epoch: [3] [3540/6500] lr: 0.000008 closs: 0.7217 (0.7402) grad_norm: 0.3929 (0.4701) time: 5.5757 data: 0.0001 max mem: 71357 -[05:05:58.925727] Epoch: [3] [3550/6500] lr: 0.000007 closs: 0.7480 (0.7403) grad_norm: 0.3891 (0.4698) time: 5.5754 data: 0.0001 max mem: 71357 -[05:06:54.610701] Epoch: [3] [3560/6500] lr: 0.000007 closs: 0.7596 (0.7405) grad_norm: 0.3929 (0.4698) time: 5.5663 data: 0.0001 max mem: 71357 -[05:07:50.323204] Epoch: [3] [3570/6500] lr: 0.000007 closs: 0.7339 (0.7403) grad_norm: 0.3710 (0.4695) time: 5.5698 data: 0.0001 max mem: 71357 -[05:08:46.047153] Epoch: [3] [3580/6500] lr: 0.000007 closs: 0.6691 (0.7402) grad_norm: 0.3913 (0.4694) time: 5.5718 data: 0.0001 max mem: 71357 -[05:09:41.796100] Epoch: [3] [3590/6500] lr: 0.000007 closs: 0.7296 (0.7402) grad_norm: 0.4097 (0.4695) time: 5.5736 data: 0.0001 max mem: 71357 -[05:10:37.416644] Epoch: [3] [3600/6500] lr: 0.000007 closs: 0.7296 (0.7400) grad_norm: 0.4042 (0.4696) time: 5.5684 data: 0.0001 max mem: 71357 -[05:11:33.195633] Epoch: [3] [3610/6500] lr: 0.000007 closs: 0.6830 (0.7399) grad_norm: 0.4097 (0.4693) time: 5.5699 data: 0.0001 max mem: 71357 -[05:12:28.863592] Epoch: [3] [3620/6500] lr: 0.000007 closs: 0.7189 (0.7399) grad_norm: 0.4191 (0.4693) time: 5.5723 data: 0.0001 max mem: 71357 -[05:13:24.628791] Epoch: [3] [3630/6500] lr: 0.000007 closs: 0.6991 (0.7398) grad_norm: 0.4191 (0.4692) time: 5.5716 data: 0.0001 max mem: 71357 -[05:14:20.229395] Epoch: [3] [3640/6500] lr: 0.000007 closs: 0.7327 (0.7402) grad_norm: 0.3995 (0.4690) time: 5.5682 data: 0.0001 max mem: 71357 -[05:15:15.944815] Epoch: [3] [3650/6500] lr: 0.000007 closs: 0.7718 (0.7402) grad_norm: 0.4191 (0.4692) time: 5.5657 data: 0.0001 max mem: 71357 -[05:16:11.642121] Epoch: [3] [3660/6500] lr: 0.000007 closs: 0.7550 (0.7403) grad_norm: 0.4011 (0.4695) time: 5.5705 data: 0.0001 max mem: 71357 -[05:17:07.479650] Epoch: [3] [3670/6500] lr: 0.000007 closs: 0.7673 (0.7402) grad_norm: 0.3984 (0.4693) time: 5.5767 data: 0.0001 max mem: 71357 -[05:18:03.202417] Epoch: [3] [3680/6500] lr: 0.000007 closs: 0.7821 (0.7403) grad_norm: 0.3894 (0.4694) time: 5.5780 data: 0.0001 max mem: 71357 -[05:18:58.921587] Epoch: [3] [3690/6500] lr: 0.000007 closs: 0.7572 (0.7405) grad_norm: 0.3894 (0.4693) time: 5.5720 data: 0.0001 max mem: 71357 -[05:19:54.690134] Epoch: [3] [3700/6500] lr: 0.000007 closs: 0.7508 (0.7406) grad_norm: 0.4002 (0.4692) time: 5.5743 data: 0.0001 max mem: 71357 -[05:20:50.328799] Epoch: [3] [3710/6500] lr: 0.000007 closs: 0.6726 (0.7405) grad_norm: 0.4002 (0.4689) time: 5.5703 data: 0.0001 max mem: 71357 -[05:21:46.081446] Epoch: [3] [3720/6500] lr: 0.000007 closs: 0.6688 (0.7405) grad_norm: 0.4002 (0.4689) time: 5.5695 data: 0.0001 max mem: 71357 -[05:22:41.792831] Epoch: [3] [3730/6500] lr: 0.000007 closs: 0.7399 (0.7405) grad_norm: 0.4459 (0.4693) time: 5.5731 data: 0.0001 max mem: 71357 -[05:23:37.391272] Epoch: [3] [3740/6500] lr: 0.000007 closs: 0.7180 (0.7404) grad_norm: 0.4610 (0.4693) time: 5.5654 data: 0.0001 max mem: 71357 -[05:24:33.095997] Epoch: [3] [3750/6500] lr: 0.000007 closs: 0.6927 (0.7403) grad_norm: 0.4757 (0.4694) time: 5.5651 data: 0.0001 max mem: 71357 -[05:25:28.837747] Epoch: [3] [3760/6500] lr: 0.000007 closs: 0.7164 (0.7405) grad_norm: 0.4757 (0.4696) time: 5.5723 data: 0.0001 max mem: 71357 -[05:26:24.545897] Epoch: [3] [3770/6500] lr: 0.000007 closs: 0.7164 (0.7405) grad_norm: 0.4634 (0.4695) time: 5.5724 data: 0.0001 max mem: 71357 -[05:27:20.234683] Epoch: [3] [3780/6500] lr: 0.000007 closs: 0.7184 (0.7405) grad_norm: 0.4144 (0.4693) time: 5.5698 data: 0.0001 max mem: 71357 -[05:28:15.998002] Epoch: [3] [3790/6500] lr: 0.000007 closs: 0.7094 (0.7405) grad_norm: 0.3937 (0.4692) time: 5.5725 data: 0.0001 max mem: 71357 -[05:29:11.732433] Epoch: [3] [3800/6500] lr: 0.000007 closs: 0.6876 (0.7406) grad_norm: 0.3814 (0.4693) time: 5.5748 data: 0.0001 max mem: 71357 -[05:30:07.575713] Epoch: [3] [3810/6500] lr: 0.000007 closs: 0.7399 (0.7407) grad_norm: 0.3745 (0.4689) time: 5.5788 data: 0.0001 max mem: 71357 -[05:31:03.224368] Epoch: [3] [3820/6500] lr: 0.000007 closs: 0.7399 (0.7408) grad_norm: 0.3791 (0.4691) time: 5.5745 data: 0.0001 max mem: 71357 -[05:31:58.833849] Epoch: [3] [3830/6500] lr: 0.000007 closs: 0.7631 (0.7406) grad_norm: 0.4356 (0.4691) time: 5.5628 data: 0.0001 max mem: 71357 -[05:32:54.496305] Epoch: [3] [3840/6500] lr: 0.000007 closs: 0.6846 (0.7406) grad_norm: 0.4207 (0.4695) time: 5.5635 data: 0.0001 max mem: 71357 -[05:33:50.347789] Epoch: [3] [3850/6500] lr: 0.000007 closs: 0.7558 (0.7407) grad_norm: 0.5182 (0.4699) time: 5.5756 data: 0.0001 max mem: 71357 -[05:34:46.039279] Epoch: [3] [3860/6500] lr: 0.000007 closs: 0.7782 (0.7407) grad_norm: 0.4689 (0.4698) time: 5.5771 data: 0.0001 max mem: 71357 -[05:35:41.780086] Epoch: [3] [3870/6500] lr: 0.000007 closs: 0.7601 (0.7408) grad_norm: 0.4636 (0.4706) time: 5.5716 data: 0.0001 max mem: 71357 -[05:36:37.492954] Epoch: [3] [3880/6500] lr: 0.000007 closs: 0.7601 (0.7409) grad_norm: 0.4093 (0.4704) time: 5.5726 data: 0.0001 max mem: 71357 -[05:37:33.218084] Epoch: [3] [3890/6500] lr: 0.000007 closs: 0.7292 (0.7407) grad_norm: 0.3931 (0.4702) time: 5.5718 data: 0.0001 max mem: 71357 -[05:38:29.017484] Epoch: [3] [3900/6500] lr: 0.000007 closs: 0.7349 (0.7407) grad_norm: 0.4056 (0.4702) time: 5.5761 data: 0.0001 max mem: 71357 -[05:39:24.708152] Epoch: [3] [3910/6500] lr: 0.000007 closs: 0.8072 (0.7410) grad_norm: 0.3931 (0.4700) time: 5.5744 data: 0.0001 max mem: 71357 -[05:40:20.454098] Epoch: [3] [3920/6500] lr: 0.000007 closs: 0.8075 (0.7412) grad_norm: 0.3901 (0.4698) time: 5.5717 data: 0.0001 max mem: 71357 -[05:41:16.072069] Epoch: [3] [3930/6500] lr: 0.000007 closs: 0.7397 (0.7413) grad_norm: 0.3941 (0.4698) time: 5.5681 data: 0.0001 max mem: 71357 -[05:42:11.874456] Epoch: [3] [3940/6500] lr: 0.000007 closs: 0.7319 (0.7412) grad_norm: 0.3901 (0.4697) time: 5.5710 data: 0.0001 max mem: 71357 -[05:43:07.571147] Epoch: [3] [3950/6500] lr: 0.000007 closs: 0.7194 (0.7411) grad_norm: 0.4072 (0.4699) time: 5.5749 data: 0.0001 max mem: 71357 -[05:44:03.269451] Epoch: [3] [3960/6500] lr: 0.000007 closs: 0.6988 (0.7410) grad_norm: 0.4316 (0.4697) time: 5.5697 data: 0.0001 max mem: 71357 -[05:44:58.911292] Epoch: [3] [3970/6500] lr: 0.000007 closs: 0.6769 (0.7411) grad_norm: 0.4400 (0.4698) time: 5.5669 data: 0.0001 max mem: 71357 -[05:45:54.762157] Epoch: [3] [3980/6500] lr: 0.000007 closs: 0.6727 (0.7409) grad_norm: 0.4316 (0.4695) time: 5.5746 data: 0.0001 max mem: 71357 -[05:46:50.420700] Epoch: [3] [3990/6500] lr: 0.000007 closs: 0.7395 (0.7412) grad_norm: 0.4151 (0.4694) time: 5.5754 data: 0.0001 max mem: 71357 -[05:47:46.149461] Epoch: [3] [4000/6500] lr: 0.000007 closs: 0.7470 (0.7409) grad_norm: 0.4080 (0.4692) time: 5.5693 data: 0.0001 max mem: 71357 -[05:48:41.905023] Epoch: [3] [4010/6500] lr: 0.000007 closs: 0.6684 (0.7408) grad_norm: 0.3969 (0.4693) time: 5.5741 data: 0.0001 max mem: 71357 -[05:49:37.663723] Epoch: [3] [4020/6500] lr: 0.000007 closs: 0.7380 (0.7409) grad_norm: 0.4290 (0.4692) time: 5.5756 data: 0.0001 max mem: 71357 -[05:50:33.353764] Epoch: [3] [4030/6500] lr: 0.000007 closs: 0.7211 (0.7407) grad_norm: 0.4290 (0.4699) time: 5.5724 data: 0.0001 max mem: 71357 -[05:51:29.001387] Epoch: [3] [4040/6500] lr: 0.000007 closs: 0.7211 (0.7408) grad_norm: 0.4670 (0.4702) time: 5.5668 data: 0.0001 max mem: 71357 -[05:52:24.657192] Epoch: [3] [4050/6500] lr: 0.000007 closs: 0.7172 (0.7407) grad_norm: 0.4290 (0.4704) time: 5.5651 data: 0.0001 max mem: 71357 -[05:53:20.370617] Epoch: [3] [4060/6500] lr: 0.000007 closs: 0.6402 (0.7406) grad_norm: 0.4262 (0.4703) time: 5.5684 data: 0.0001 max mem: 71357 -[05:54:16.087743] Epoch: [3] [4070/6500] lr: 0.000007 closs: 0.6991 (0.7406) grad_norm: 0.4298 (0.4702) time: 5.5714 data: 0.0001 max mem: 71357 -[05:55:11.720085] Epoch: [3] [4080/6500] lr: 0.000007 closs: 0.7579 (0.7408) grad_norm: 0.4249 (0.4702) time: 5.5674 data: 0.0001 max mem: 71357 -[05:56:07.403518] Epoch: [3] [4090/6500] lr: 0.000007 closs: 0.7559 (0.7408) grad_norm: 0.4282 (0.4702) time: 5.5657 data: 0.0001 max mem: 71357 -[05:57:03.082301] Epoch: [3] [4100/6500] lr: 0.000007 closs: 0.6847 (0.7406) grad_norm: 0.3763 (0.4701) time: 5.5680 data: 0.0001 max mem: 71357 -[05:57:58.773781] Epoch: [3] [4110/6500] lr: 0.000007 closs: 0.6676 (0.7406) grad_norm: 0.3763 (0.4703) time: 5.5684 data: 0.0001 max mem: 71357 -[05:58:54.385862] Epoch: [3] [4120/6500] lr: 0.000007 closs: 0.7262 (0.7404) grad_norm: 0.4959 (0.4706) time: 5.5651 data: 0.0001 max mem: 71357 -[05:59:50.104040] Epoch: [3] [4130/6500] lr: 0.000007 closs: 0.7262 (0.7403) grad_norm: 0.4959 (0.4705) time: 5.5664 data: 0.0001 max mem: 71357 -[06:00:45.692846] Epoch: [3] [4140/6500] lr: 0.000007 closs: 0.7231 (0.7404) grad_norm: 0.5343 (0.4706) time: 5.5653 data: 0.0001 max mem: 71357 -[06:01:41.355915] Epoch: [3] [4150/6500] lr: 0.000007 closs: 0.7379 (0.7404) grad_norm: 0.5177 (0.4707) time: 5.5625 data: 0.0001 max mem: 71357 -[06:02:37.089968] Epoch: [3] [4160/6500] lr: 0.000007 closs: 0.7380 (0.7404) grad_norm: 0.4467 (0.4707) time: 5.5698 data: 0.0001 max mem: 71357 -[06:03:32.774291] Epoch: [3] [4170/6500] lr: 0.000007 closs: 0.7332 (0.7404) grad_norm: 0.5048 (0.4711) time: 5.5708 data: 0.0001 max mem: 71357 -[06:04:28.497632] Epoch: [3] [4180/6500] lr: 0.000007 closs: 0.7578 (0.7405) grad_norm: 0.4494 (0.4712) time: 5.5703 data: 0.0001 max mem: 71357 -[06:05:24.216613] Epoch: [3] [4190/6500] lr: 0.000007 closs: 0.7601 (0.7405) grad_norm: 0.4129 (0.4710) time: 5.5720 data: 0.0001 max mem: 71357 -[06:06:20.098227] Epoch: [3] [4200/6500] lr: 0.000007 closs: 0.7665 (0.7406) grad_norm: 0.3923 (0.4710) time: 5.5800 data: 0.0001 max mem: 71357 -[06:07:15.828052] Epoch: [3] [4210/6500] lr: 0.000007 closs: 0.7841 (0.7408) grad_norm: 0.3622 (0.4709) time: 5.5805 data: 0.0001 max mem: 71357 -[06:08:11.532027] Epoch: [3] [4220/6500] lr: 0.000007 closs: 0.7194 (0.7406) grad_norm: 0.3691 (0.4706) time: 5.5716 data: 0.0001 max mem: 71357 -[06:09:07.164298] Epoch: [3] [4230/6500] lr: 0.000006 closs: 0.6699 (0.7404) grad_norm: 0.3812 (0.4724) time: 5.5667 data: 0.0001 max mem: 71357 -[06:10:02.992629] Epoch: [3] [4240/6500] lr: 0.000006 closs: 0.6629 (0.7404) grad_norm: 0.4220 (0.4724) time: 5.5729 data: 0.0001 max mem: 71357 -[06:10:58.745878] Epoch: [3] [4250/6500] lr: 0.000006 closs: 0.7612 (0.7405) grad_norm: 0.3945 (0.4722) time: 5.5790 data: 0.0001 max mem: 71357 -[06:11:54.440900] Epoch: [3] [4260/6500] lr: 0.000006 closs: 0.7949 (0.7407) grad_norm: 0.4455 (0.4722) time: 5.5724 data: 0.0001 max mem: 71357 -[06:12:50.182793] Epoch: [3] [4270/6500] lr: 0.000006 closs: 0.8130 (0.7407) grad_norm: 0.4455 (0.4722) time: 5.5718 data: 0.0001 max mem: 71357 -[06:13:45.770945] Epoch: [3] [4280/6500] lr: 0.000006 closs: 0.6939 (0.7405) grad_norm: 0.4629 (0.4730) time: 5.5664 data: 0.0001 max mem: 71357 -[06:14:41.714399] Epoch: [3] [4290/6500] lr: 0.000006 closs: 0.7175 (0.7407) grad_norm: 0.4635 (0.4731) time: 5.5765 data: 0.0001 max mem: 71357 -[06:15:37.371600] Epoch: [3] [4300/6500] lr: 0.000006 closs: 0.8015 (0.7408) grad_norm: 0.4426 (0.4728) time: 5.5799 data: 0.0001 max mem: 71357 -[06:16:33.108011] Epoch: [3] [4310/6500] lr: 0.000006 closs: 0.7606 (0.7409) grad_norm: 0.4413 (0.4732) time: 5.5696 data: 0.0001 max mem: 71357 -[06:17:28.770795] Epoch: [3] [4320/6500] lr: 0.000006 closs: 0.7072 (0.7408) grad_norm: 0.3953 (0.4731) time: 5.5699 data: 0.0001 max mem: 71357 -[06:18:24.617908] Epoch: [3] [4330/6500] lr: 0.000006 closs: 0.7142 (0.7409) grad_norm: 0.3763 (0.4730) time: 5.5754 data: 0.0001 max mem: 71357 -[06:19:20.284502] Epoch: [3] [4340/6500] lr: 0.000006 closs: 0.7488 (0.7408) grad_norm: 0.4262 (0.4732) time: 5.5756 data: 0.0001 max mem: 71357 -[06:20:16.015270] Epoch: [3] [4350/6500] lr: 0.000006 closs: 0.6938 (0.7408) grad_norm: 0.4168 (0.4731) time: 5.5698 data: 0.0001 max mem: 71357 -[06:21:11.725133] Epoch: [3] [4360/6500] lr: 0.000006 closs: 0.7256 (0.7410) grad_norm: 0.4232 (0.4732) time: 5.5720 data: 0.0001 max mem: 71357 -[06:22:07.350207] Epoch: [3] [4370/6500] lr: 0.000006 closs: 0.7400 (0.7410) grad_norm: 0.4398 (0.4732) time: 5.5667 data: 0.0001 max mem: 71357 -[06:23:03.236264] Epoch: [3] [4380/6500] lr: 0.000006 closs: 0.7400 (0.7410) grad_norm: 0.4398 (0.4735) time: 5.5755 data: 0.0001 max mem: 71357 -[06:23:58.889034] Epoch: [3] [4390/6500] lr: 0.000006 closs: 0.7240 (0.7409) grad_norm: 0.4513 (0.4736) time: 5.5769 data: 0.0001 max mem: 71357 -[06:24:54.533006] Epoch: [3] [4400/6500] lr: 0.000006 closs: 0.7240 (0.7408) grad_norm: 0.4513 (0.4737) time: 5.5647 data: 0.0001 max mem: 71357 -[06:25:50.206903] Epoch: [3] [4410/6500] lr: 0.000006 closs: 0.7493 (0.7411) grad_norm: 0.4513 (0.4737) time: 5.5658 data: 0.0001 max mem: 71357 -[06:26:45.995600] Epoch: [3] [4420/6500] lr: 0.000006 closs: 0.7180 (0.7411) grad_norm: 0.4341 (0.4776) time: 5.5730 data: 0.0001 max mem: 71357 -[06:27:41.697454] Epoch: [3] [4430/6500] lr: 0.000006 closs: 0.6501 (0.7409) grad_norm: 0.4669 (0.4775) time: 5.5745 data: 0.0001 max mem: 71357 -[06:28:37.418266] Epoch: [3] [4440/6500] lr: 0.000006 closs: 0.6607 (0.7407) grad_norm: 0.4669 (0.4775) time: 5.5711 data: 0.0001 max mem: 71357 -[06:29:33.035519] Epoch: [3] [4450/6500] lr: 0.000006 closs: 0.6990 (0.7406) grad_norm: 0.4688 (0.4777) time: 5.5668 data: 0.0001 max mem: 71357 -[06:30:28.813181] Epoch: [3] [4460/6500] lr: 0.000006 closs: 0.7062 (0.7406) grad_norm: 0.4669 (0.4776) time: 5.5697 data: 0.0001 max mem: 71357 -[06:31:24.518637] Epoch: [3] [4470/6500] lr: 0.000006 closs: 0.8034 (0.7408) grad_norm: 0.4726 (0.4779) time: 5.5741 data: 0.0001 max mem: 71357 -[06:32:20.176534] Epoch: [3] [4480/6500] lr: 0.000006 closs: 0.7781 (0.7407) grad_norm: 0.4556 (0.4779) time: 5.5681 data: 0.0001 max mem: 71357 -[06:33:15.934576] Epoch: [3] [4490/6500] lr: 0.000006 closs: 0.7453 (0.7408) grad_norm: 0.4051 (0.4776) time: 5.5707 data: 0.0001 max mem: 71357 -[06:34:11.633237] Epoch: [3] [4500/6500] lr: 0.000006 closs: 0.7150 (0.7406) grad_norm: 0.4099 (0.4775) time: 5.5728 data: 0.0001 max mem: 71357 -[06:35:07.409947] Epoch: [3] [4510/6500] lr: 0.000006 closs: 0.6874 (0.7405) grad_norm: 0.4041 (0.4775) time: 5.5737 data: 0.0001 max mem: 71357 -[06:36:03.057657] Epoch: [3] [4520/6500] lr: 0.000006 closs: 0.7244 (0.7407) grad_norm: 0.4633 (0.4778) time: 5.5711 data: 0.0001 max mem: 71357 -[06:36:58.751863] Epoch: [3] [4530/6500] lr: 0.000006 closs: 0.7346 (0.7407) grad_norm: 0.4917 (0.4777) time: 5.5670 data: 0.0001 max mem: 71357 -[06:37:54.453452] Epoch: [3] [4540/6500] lr: 0.000006 closs: 0.7567 (0.7407) grad_norm: 0.4633 (0.4776) time: 5.5697 data: 0.0001 max mem: 71357 -[06:38:50.277539] Epoch: [3] [4550/6500] lr: 0.000006 closs: 0.7567 (0.7407) grad_norm: 0.4308 (0.4778) time: 5.5762 data: 0.0001 max mem: 71357 -[06:39:45.982251] Epoch: [3] [4560/6500] lr: 0.000006 closs: 0.7651 (0.7409) grad_norm: 0.4175 (0.4777) time: 5.5764 data: 0.0001 max mem: 71357 -[06:40:41.651886] Epoch: [3] [4570/6500] lr: 0.000006 closs: 0.7811 (0.7410) grad_norm: 0.3881 (0.4776) time: 5.5686 data: 0.0001 max mem: 71357 -[06:41:37.315108] Epoch: [3] [4580/6500] lr: 0.000006 closs: 0.7358 (0.7410) grad_norm: 0.3934 (0.4776) time: 5.5666 data: 0.0001 max mem: 71357 -[06:42:33.052296] Epoch: [3] [4590/6500] lr: 0.000006 closs: 0.7234 (0.7410) grad_norm: 0.3934 (0.4775) time: 5.5700 data: 0.0001 max mem: 71357 -[06:43:28.866940] Epoch: [3] [4600/6500] lr: 0.000006 closs: 0.7615 (0.7411) grad_norm: 0.4202 (0.4775) time: 5.5775 data: 0.0001 max mem: 71357 -[06:44:24.515048] Epoch: [3] [4610/6500] lr: 0.000006 closs: 0.6976 (0.7410) grad_norm: 0.4237 (0.4774) time: 5.5731 data: 0.0001 max mem: 71357 -[06:45:20.203725] Epoch: [3] [4620/6500] lr: 0.000006 closs: 0.6506 (0.7410) grad_norm: 0.4171 (0.4772) time: 5.5668 data: 0.0001 max mem: 71357 -[06:46:15.879177] Epoch: [3] [4630/6500] lr: 0.000006 closs: 0.7314 (0.7410) grad_norm: 0.4004 (0.4769) time: 5.5681 data: 0.0001 max mem: 71357 -[06:47:11.707137] Epoch: [3] [4640/6500] lr: 0.000006 closs: 0.7390 (0.7410) grad_norm: 0.3716 (0.4768) time: 5.5751 data: 0.0001 max mem: 71357 -[06:48:07.323873] Epoch: [3] [4650/6500] lr: 0.000006 closs: 0.7443 (0.7409) grad_norm: 0.3923 (0.4769) time: 5.5722 data: 0.0001 max mem: 71357 -[06:49:03.064604] Epoch: [3] [4660/6500] lr: 0.000006 closs: 0.7443 (0.7409) grad_norm: 0.3923 (0.4767) time: 5.5678 data: 0.0001 max mem: 71357 -[06:49:58.767683] Epoch: [3] [4670/6500] lr: 0.000006 closs: 0.7634 (0.7410) grad_norm: 0.3923 (0.4766) time: 5.5721 data: 0.0001 max mem: 71357 -[06:50:54.566372] Epoch: [3] [4680/6500] lr: 0.000006 closs: 0.7191 (0.7408) grad_norm: 0.3871 (0.4765) time: 5.5750 data: 0.0001 max mem: 71357 -[06:51:50.323458] Epoch: [3] [4690/6500] lr: 0.000006 closs: 0.7153 (0.7407) grad_norm: 0.3820 (0.4766) time: 5.5777 data: 0.0001 max mem: 71357 -[06:52:46.085085] Epoch: [3] [4700/6500] lr: 0.000006 closs: 0.6925 (0.7407) grad_norm: 0.3504 (0.4763) time: 5.5759 data: 0.0001 max mem: 71357 -[06:53:41.820424] Epoch: [3] [4710/6500] lr: 0.000006 closs: 0.6925 (0.7406) grad_norm: 0.3788 (0.4764) time: 5.5747 data: 0.0001 max mem: 71357 -[06:54:37.430020] Epoch: [3] [4720/6500] lr: 0.000006 closs: 0.7173 (0.7405) grad_norm: 0.3988 (0.4764) time: 5.5671 data: 0.0001 max mem: 71357 -[06:55:33.170954] Epoch: [3] [4730/6500] lr: 0.000006 closs: 0.7568 (0.7407) grad_norm: 0.4271 (0.4763) time: 5.5674 data: 0.0001 max mem: 71357 -[06:56:28.817636] Epoch: [3] [4740/6500] lr: 0.000006 closs: 0.7464 (0.7408) grad_norm: 0.4350 (0.4761) time: 5.5693 data: 0.0001 max mem: 71357 -[06:57:24.442554] Epoch: [3] [4750/6500] lr: 0.000006 closs: 0.7297 (0.7408) grad_norm: 0.4253 (0.4760) time: 5.5635 data: 0.0001 max mem: 71357 -[06:58:20.075994] Epoch: [3] [4760/6500] lr: 0.000006 closs: 0.7300 (0.7408) grad_norm: 0.4253 (0.4768) time: 5.5628 data: 0.0001 max mem: 71357 -[06:59:15.879354] Epoch: [3] [4770/6500] lr: 0.000006 closs: 0.6813 (0.7408) grad_norm: 0.4014 (0.4768) time: 5.5718 data: 0.0001 max mem: 71357 -[07:00:11.589236] Epoch: [3] [4780/6500] lr: 0.000006 closs: 0.6824 (0.7409) grad_norm: 0.4178 (0.4767) time: 5.5756 data: 0.0001 max mem: 71357 -[07:01:07.310329] Epoch: [3] [4790/6500] lr: 0.000006 closs: 0.7243 (0.7409) grad_norm: 0.4163 (0.4766) time: 5.5714 data: 0.0001 max mem: 71357 -[07:02:03.036933] Epoch: [3] [4800/6500] lr: 0.000006 closs: 0.7869 (0.7410) grad_norm: 0.4163 (0.4766) time: 5.5723 data: 0.0001 max mem: 71357 -[07:02:58.635514] Epoch: [3] [4810/6500] lr: 0.000006 closs: 0.7832 (0.7410) grad_norm: 0.4575 (0.4767) time: 5.5662 data: 0.0001 max mem: 71357 -[07:03:54.439261] Epoch: [3] [4820/6500] lr: 0.000006 closs: 0.7549 (0.7410) grad_norm: 0.4702 (0.4769) time: 5.5701 data: 0.0001 max mem: 71357 -[07:04:50.068310] Epoch: [3] [4830/6500] lr: 0.000006 closs: 0.7156 (0.7410) grad_norm: 0.4575 (0.4768) time: 5.5716 data: 0.0001 max mem: 71357 -[07:05:45.752970] Epoch: [3] [4840/6500] lr: 0.000006 closs: 0.7375 (0.7409) grad_norm: 0.4076 (0.4766) time: 5.5656 data: 0.0001 max mem: 71357 -[07:06:41.332388] Epoch: [3] [4850/6500] lr: 0.000006 closs: 0.7409 (0.7410) grad_norm: 0.4074 (0.4767) time: 5.5631 data: 0.0001 max mem: 71357 -[07:07:37.187318] Epoch: [3] [4860/6500] lr: 0.000006 closs: 0.6977 (0.7409) grad_norm: 0.4074 (0.4767) time: 5.5716 data: 0.0001 max mem: 71357 -[07:08:32.915254] Epoch: [3] [4870/6500] lr: 0.000006 closs: 0.6836 (0.7408) grad_norm: 0.4279 (0.4767) time: 5.5790 data: 0.0001 max mem: 71357 -[07:09:28.532950] Epoch: [3] [4880/6500] lr: 0.000006 closs: 0.7120 (0.7409) grad_norm: 0.4074 (0.4766) time: 5.5672 data: 0.0001 max mem: 71357 -[07:10:24.255320] Epoch: [3] [4890/6500] lr: 0.000006 closs: 0.7367 (0.7407) grad_norm: 0.4095 (0.4767) time: 5.5669 data: 0.0001 max mem: 71357 -[07:11:20.031107] Epoch: [3] [4900/6500] lr: 0.000006 closs: 0.7259 (0.7407) grad_norm: 0.4011 (0.4766) time: 5.5748 data: 0.0001 max mem: 71357 -[07:12:15.812216] Epoch: [3] [4910/6500] lr: 0.000006 closs: 0.7271 (0.7408) grad_norm: 0.4011 (0.4765) time: 5.5777 data: 0.0001 max mem: 71357 -[07:13:11.520623] Epoch: [3] [4920/6500] lr: 0.000006 closs: 0.7102 (0.7408) grad_norm: 0.4011 (0.4763) time: 5.5744 data: 0.0001 max mem: 71357 -[07:14:07.160205] Epoch: [3] [4930/6500] lr: 0.000006 closs: 0.7200 (0.7408) grad_norm: 0.3937 (0.4763) time: 5.5673 data: 0.0001 max mem: 71357 -[07:15:02.721256] Epoch: [3] [4940/6500] lr: 0.000006 closs: 0.7026 (0.7407) grad_norm: 0.4128 (0.4766) time: 5.5599 data: 0.0001 max mem: 71357 -[07:15:58.537548] Epoch: [3] [4950/6500] lr: 0.000006 closs: 0.6585 (0.7407) grad_norm: 0.4183 (0.4768) time: 5.5688 data: 0.0001 max mem: 71357 -[07:16:54.210667] Epoch: [3] [4960/6500] lr: 0.000006 closs: 0.6897 (0.7407) grad_norm: 0.4334 (0.4766) time: 5.5744 data: 0.0001 max mem: 71357 -[07:17:49.890698] Epoch: [3] [4970/6500] lr: 0.000006 closs: 0.7234 (0.7405) grad_norm: 0.4334 (0.4766) time: 5.5676 data: 0.0001 max mem: 71357 -[07:18:45.562111] Epoch: [3] [4980/6500] lr: 0.000006 closs: 0.7334 (0.7404) grad_norm: 0.3593 (0.4764) time: 5.5675 data: 0.0001 max mem: 71357 -[07:19:41.353861] Epoch: [3] [4990/6500] lr: 0.000006 closs: 0.6968 (0.7404) grad_norm: 0.3642 (0.4765) time: 5.5731 data: 0.0001 max mem: 71357 -[07:20:37.076513] Epoch: [3] [5000/6500] lr: 0.000006 closs: 0.6968 (0.7405) grad_norm: 0.4023 (0.4765) time: 5.5757 data: 0.0001 max mem: 71357 -[07:21:32.711715] Epoch: [3] [5010/6500] lr: 0.000006 closs: 0.7888 (0.7405) grad_norm: 0.4166 (0.4765) time: 5.5678 data: 0.0001 max mem: 71357 -[07:22:28.392680] Epoch: [3] [5020/6500] lr: 0.000006 closs: 0.8213 (0.7407) grad_norm: 0.4392 (0.4764) time: 5.5657 data: 0.0001 max mem: 71357 -[07:23:24.053200] Epoch: [3] [5030/6500] lr: 0.000006 closs: 0.8213 (0.7408) grad_norm: 0.4247 (0.4763) time: 5.5670 data: 0.0001 max mem: 71357 -[07:24:19.885504] Epoch: [3] [5040/6500] lr: 0.000006 closs: 0.7937 (0.7409) grad_norm: 0.4176 (0.4762) time: 5.5745 data: 0.0001 max mem: 71357 -[07:25:15.541561] Epoch: [3] [5050/6500] lr: 0.000006 closs: 0.7597 (0.7410) grad_norm: 0.4224 (0.4763) time: 5.5743 data: 0.0001 max mem: 71357 -[07:26:11.374618] Epoch: [3] [5060/6500] lr: 0.000006 closs: 0.6788 (0.7409) grad_norm: 0.4224 (0.4761) time: 5.5744 data: 0.0001 max mem: 71357 -[07:27:07.078997] Epoch: [3] [5070/6500] lr: 0.000006 closs: 0.7492 (0.7409) grad_norm: 0.4120 (0.4759) time: 5.5768 data: 0.0001 max mem: 71357 -[07:28:02.865357] Epoch: [3] [5080/6500] lr: 0.000006 closs: 0.7492 (0.7410) grad_norm: 0.4302 (0.4762) time: 5.5744 data: 0.0001 max mem: 71357 -[07:28:58.502309] Epoch: [3] [5090/6500] lr: 0.000006 closs: 0.7938 (0.7412) grad_norm: 0.4302 (0.4761) time: 5.5711 data: 0.0001 max mem: 71357 -[07:29:54.168306] Epoch: [3] [5100/6500] lr: 0.000006 closs: 0.7356 (0.7412) grad_norm: 0.4987 (0.4762) time: 5.5650 data: 0.0001 max mem: 71357 -[07:30:49.827035] Epoch: [3] [5110/6500] lr: 0.000006 closs: 0.7003 (0.7412) grad_norm: 0.4987 (0.4761) time: 5.5661 data: 0.0001 max mem: 71357 -[07:31:45.641823] Epoch: [3] [5120/6500] lr: 0.000006 closs: 0.7438 (0.7412) grad_norm: 0.4244 (0.4760) time: 5.5736 data: 0.0001 max mem: 71357 -[07:32:41.455089] Epoch: [3] [5130/6500] lr: 0.000006 closs: 0.7517 (0.7413) grad_norm: 0.4237 (0.4758) time: 5.5813 data: 0.0001 max mem: 71357 -[07:33:37.179398] Epoch: [3] [5140/6500] lr: 0.000006 closs: 0.7771 (0.7413) grad_norm: 0.4015 (0.4757) time: 5.5768 data: 0.0001 max mem: 71357 -[07:34:32.952469] Epoch: [3] [5150/6500] lr: 0.000006 closs: 0.6882 (0.7412) grad_norm: 0.3916 (0.4756) time: 5.5748 data: 0.0001 max mem: 71357 -[07:35:28.690763] Epoch: [3] [5160/6500] lr: 0.000006 closs: 0.7365 (0.7415) grad_norm: 0.4015 (0.4758) time: 5.5755 data: 0.0001 max mem: 71357 -[07:36:24.470225] Epoch: [3] [5170/6500] lr: 0.000006 closs: 0.7372 (0.7414) grad_norm: 0.4123 (0.4757) time: 5.5758 data: 0.0001 max mem: 71357 -[07:37:20.151884] Epoch: [3] [5180/6500] lr: 0.000006 closs: 0.7399 (0.7416) grad_norm: 0.4265 (0.4759) time: 5.5730 data: 0.0001 max mem: 71357 -[07:38:15.804126] Epoch: [3] [5190/6500] lr: 0.000005 closs: 0.7578 (0.7416) grad_norm: 0.4265 (0.4757) time: 5.5666 data: 0.0001 max mem: 71357 -[07:39:11.488282] Epoch: [3] [5200/6500] lr: 0.000005 closs: 0.6937 (0.7416) grad_norm: 0.4066 (0.4756) time: 5.5667 data: 0.0001 max mem: 71357 -[07:40:07.267455] Epoch: [3] [5210/6500] lr: 0.000005 closs: 0.7611 (0.7417) grad_norm: 0.3831 (0.4756) time: 5.5731 data: 0.0001 max mem: 71357 -[07:41:03.069520] Epoch: [3] [5220/6500] lr: 0.000005 closs: 0.7594 (0.7417) grad_norm: 0.3831 (0.4756) time: 5.5790 data: 0.0001 max mem: 71357 -[07:41:58.738721] Epoch: [3] [5230/6500] lr: 0.000005 closs: 0.6946 (0.7416) grad_norm: 0.4384 (0.4756) time: 5.5735 data: 0.0001 max mem: 71357 -[07:42:54.394666] Epoch: [3] [5240/6500] lr: 0.000005 closs: 0.5834 (0.7414) grad_norm: 0.4770 (0.4757) time: 5.5662 data: 0.0001 max mem: 71357 -[07:43:50.008917] Epoch: [3] [5250/6500] lr: 0.000005 closs: 0.6269 (0.7414) grad_norm: 0.4798 (0.4757) time: 5.5634 data: 0.0001 max mem: 71357 -[07:44:45.778422] Epoch: [3] [5260/6500] lr: 0.000005 closs: 0.7557 (0.7414) grad_norm: 0.4838 (0.4758) time: 5.5691 data: 0.0001 max mem: 71357 -[07:45:41.527673] Epoch: [3] [5270/6500] lr: 0.000005 closs: 0.7557 (0.7415) grad_norm: 0.5086 (0.4760) time: 5.5759 data: 0.0001 max mem: 71357 -[07:46:37.191214] Epoch: [3] [5280/6500] lr: 0.000005 closs: 0.7615 (0.7416) grad_norm: 0.4950 (0.4759) time: 5.5706 data: 0.0001 max mem: 71357 -[07:47:32.847414] Epoch: [3] [5290/6500] lr: 0.000005 closs: 0.7668 (0.7417) grad_norm: 0.4947 (0.4758) time: 5.5659 data: 0.0001 max mem: 71357 -[07:48:28.717965] Epoch: [3] [5300/6500] lr: 0.000005 closs: 0.7729 (0.7417) grad_norm: 0.4736 (0.4759) time: 5.5763 data: 0.0001 max mem: 71357 -[07:49:24.422934] Epoch: [3] [5310/6500] lr: 0.000005 closs: 0.7493 (0.7416) grad_norm: 0.4380 (0.4759) time: 5.5787 data: 0.0001 max mem: 71357 -[07:50:20.140110] Epoch: [3] [5320/6500] lr: 0.000005 closs: 0.7367 (0.7417) grad_norm: 0.4736 (0.4760) time: 5.5710 data: 0.0001 max mem: 71357 -[07:51:15.914706] Epoch: [3] [5330/6500] lr: 0.000005 closs: 0.7469 (0.7418) grad_norm: 0.4288 (0.4758) time: 5.5745 data: 0.0001 max mem: 71357 -[07:52:11.772834] Epoch: [3] [5340/6500] lr: 0.000005 closs: 0.7469 (0.7417) grad_norm: 0.3868 (0.4758) time: 5.5816 data: 0.0001 max mem: 71357 -[07:53:07.443542] Epoch: [3] [5350/6500] lr: 0.000005 closs: 0.7176 (0.7417) grad_norm: 0.3952 (0.4757) time: 5.5763 data: 0.0001 max mem: 71357 -[07:54:03.117070] Epoch: [3] [5360/6500] lr: 0.000005 closs: 0.7500 (0.7417) grad_norm: 0.3854 (0.4756) time: 5.5671 data: 0.0001 max mem: 71357 -[07:54:58.869745] Epoch: [3] [5370/6500] lr: 0.000005 closs: 0.7227 (0.7417) grad_norm: 0.3854 (0.4756) time: 5.5712 data: 0.0001 max mem: 71357 -[07:55:54.574182] Epoch: [3] [5380/6500] lr: 0.000005 closs: 0.7051 (0.7416) grad_norm: 0.3838 (0.4755) time: 5.5728 data: 0.0001 max mem: 71357 -[07:56:50.331719] Epoch: [3] [5390/6500] lr: 0.000005 closs: 0.7780 (0.7417) grad_norm: 0.4164 (0.4756) time: 5.5730 data: 0.0001 max mem: 71357 -[07:57:45.936766] Epoch: [3] [5400/6500] lr: 0.000005 closs: 0.7780 (0.7418) grad_norm: 0.4019 (0.4754) time: 5.5681 data: 0.0001 max mem: 71357 -[07:58:41.625507] Epoch: [3] [5410/6500] lr: 0.000005 closs: 0.7760 (0.7419) grad_norm: 0.4019 (0.4754) time: 5.5646 data: 0.0001 max mem: 71357 -[07:59:37.221626] Epoch: [3] [5420/6500] lr: 0.000005 closs: 0.7735 (0.7419) grad_norm: 0.4019 (0.4753) time: 5.5641 data: 0.0001 max mem: 71357 -[08:00:32.942836] Epoch: [3] [5430/6500] lr: 0.000005 closs: 0.7487 (0.7418) grad_norm: 0.3855 (0.4752) time: 5.5658 data: 0.0001 max mem: 71357 -[08:01:28.737895] Epoch: [3] [5440/6500] lr: 0.000005 closs: 0.7487 (0.7419) grad_norm: 0.3855 (0.4751) time: 5.5757 data: 0.0001 max mem: 71357 -[08:02:24.357380] Epoch: [3] [5450/6500] lr: 0.000005 closs: 0.7460 (0.7419) grad_norm: 0.3975 (0.4751) time: 5.5707 data: 0.0001 max mem: 71357 -[08:03:20.013233] Epoch: [3] [5460/6500] lr: 0.000005 closs: 0.7460 (0.7419) grad_norm: 0.4548 (0.4752) time: 5.5637 data: 0.0001 max mem: 71357 -[08:04:15.677277] Epoch: [3] [5470/6500] lr: 0.000005 closs: 0.8010 (0.7420) grad_norm: 0.4553 (0.4751) time: 5.5659 data: 0.0001 max mem: 71357 -[08:05:11.472975] Epoch: [3] [5480/6500] lr: 0.000005 closs: 0.7603 (0.7419) grad_norm: 0.4553 (0.4752) time: 5.5729 data: 0.0001 max mem: 71357 -[08:06:07.124938] Epoch: [3] [5490/6500] lr: 0.000005 closs: 0.7009 (0.7418) grad_norm: 0.4387 (0.4750) time: 5.5723 data: 0.0001 max mem: 71357 -[08:07:02.722427] Epoch: [3] [5500/6500] lr: 0.000005 closs: 0.7009 (0.7417) grad_norm: 0.4303 (0.4751) time: 5.5624 data: 0.0001 max mem: 71357 -[08:07:58.375710] Epoch: [3] [5510/6500] lr: 0.000005 closs: 0.6630 (0.7415) grad_norm: 0.4247 (0.4758) time: 5.5625 data: 0.0001 max mem: 71357 -[08:08:54.297860] Epoch: [3] [5520/6500] lr: 0.000005 closs: 0.7243 (0.7415) grad_norm: 0.4047 (0.4758) time: 5.5787 data: 0.0001 max mem: 71357 -[08:09:49.997283] Epoch: [3] [5530/6500] lr: 0.000005 closs: 0.7034 (0.7413) grad_norm: 0.4303 (0.4758) time: 5.5810 data: 0.0001 max mem: 71357 -[08:10:45.674960] Epoch: [3] [5540/6500] lr: 0.000005 closs: 0.7039 (0.7413) grad_norm: 0.4179 (0.4757) time: 5.5688 data: 0.0001 max mem: 71357 -[08:11:41.302248] Epoch: [3] [5550/6500] lr: 0.000005 closs: 0.7121 (0.7414) grad_norm: 0.4179 (0.4758) time: 5.5652 data: 0.0001 max mem: 71357 -[08:12:37.151238] Epoch: [3] [5560/6500] lr: 0.000005 closs: 0.6410 (0.7412) grad_norm: 0.4524 (0.4762) time: 5.5737 data: 0.0001 max mem: 71357 -[08:13:32.880058] Epoch: [3] [5570/6500] lr: 0.000005 closs: 0.6737 (0.7413) grad_norm: 0.4524 (0.4761) time: 5.5788 data: 0.0001 max mem: 71357 -[08:14:28.497505] Epoch: [3] [5580/6500] lr: 0.000005 closs: 0.7277 (0.7412) grad_norm: 0.4449 (0.4760) time: 5.5673 data: 0.0001 max mem: 71357 -[08:15:24.123502] Epoch: [3] [5590/6500] lr: 0.000005 closs: 0.7041 (0.7412) grad_norm: 0.4449 (0.4761) time: 5.5621 data: 0.0001 max mem: 71357 -[08:16:19.713976] Epoch: [3] [5600/6500] lr: 0.000005 closs: 0.7092 (0.7412) grad_norm: 0.3970 (0.4760) time: 5.5607 data: 0.0001 max mem: 71357 -[08:17:15.578612] Epoch: [3] [5610/6500] lr: 0.000005 closs: 0.7257 (0.7411) grad_norm: 0.3970 (0.4760) time: 5.5727 data: 0.0001 max mem: 71357 -[08:18:11.270830] Epoch: [3] [5620/6500] lr: 0.000005 closs: 0.7130 (0.7411) grad_norm: 0.4082 (0.4759) time: 5.5778 data: 0.0001 max mem: 71357 -[08:19:07.047039] Epoch: [3] [5630/6500] lr: 0.000005 closs: 0.6523 (0.7411) grad_norm: 0.4421 (0.4759) time: 5.5734 data: 0.0001 max mem: 71357 -[08:20:02.685791] Epoch: [3] [5640/6500] lr: 0.000005 closs: 0.6917 (0.7410) grad_norm: 0.4677 (0.4758) time: 5.5707 data: 0.0001 max mem: 71357 -[08:20:58.580036] Epoch: [3] [5650/6500] lr: 0.000005 closs: 0.7371 (0.7411) grad_norm: 0.4498 (0.4758) time: 5.5766 data: 0.0001 max mem: 71357 -[08:21:54.213166] Epoch: [3] [5660/6500] lr: 0.000005 closs: 0.7371 (0.7412) grad_norm: 0.4677 (0.4759) time: 5.5763 data: 0.0001 max mem: 71357 -[08:22:49.950783] Epoch: [3] [5670/6500] lr: 0.000005 closs: 0.7466 (0.7413) grad_norm: 0.3905 (0.4758) time: 5.5684 data: 0.0001 max mem: 71357 -[08:23:45.705757] Epoch: [3] [5680/6500] lr: 0.000005 closs: 0.7321 (0.7412) grad_norm: 0.4498 (0.4762) time: 5.5745 data: 0.0001 max mem: 71357 -[08:24:41.256843] Epoch: [3] [5690/6500] lr: 0.000005 closs: 0.6601 (0.7411) grad_norm: 0.4623 (0.4764) time: 5.5652 data: 0.0001 max mem: 71357 -[08:25:37.016730] Epoch: [3] [5700/6500] lr: 0.000005 closs: 0.6825 (0.7410) grad_norm: 0.4525 (0.4763) time: 5.5655 data: 0.0001 max mem: 71357 -[08:26:32.785310] Epoch: [3] [5710/6500] lr: 0.000005 closs: 0.6972 (0.7410) grad_norm: 0.4525 (0.4762) time: 5.5763 data: 0.0001 max mem: 71357 -[08:27:28.440180] Epoch: [3] [5720/6500] lr: 0.000005 closs: 0.7447 (0.7411) grad_norm: 0.4266 (0.4761) time: 5.5711 data: 0.0001 max mem: 71357 -[08:28:24.134165] Epoch: [3] [5730/6500] lr: 0.000005 closs: 0.7670 (0.7412) grad_norm: 0.4178 (0.4761) time: 5.5674 data: 0.0001 max mem: 71357 -[08:29:19.997220] Epoch: [3] [5740/6500] lr: 0.000005 closs: 0.7214 (0.7410) grad_norm: 0.4178 (0.4760) time: 5.5778 data: 0.0001 max mem: 71357 -[08:30:15.733399] Epoch: [3] [5750/6500] lr: 0.000005 closs: 0.6831 (0.7409) grad_norm: 0.4367 (0.4761) time: 5.5799 data: 0.0001 max mem: 71357 -[08:31:11.458578] Epoch: [3] [5760/6500] lr: 0.000005 closs: 0.6616 (0.7408) grad_norm: 0.4367 (0.4761) time: 5.5730 data: 0.0001 max mem: 71357 -[08:32:07.031492] Epoch: [3] [5770/6500] lr: 0.000005 closs: 0.7017 (0.7408) grad_norm: 0.5017 (0.4761) time: 5.5648 data: 0.0001 max mem: 71357 -[08:33:02.681051] Epoch: [3] [5780/6500] lr: 0.000005 closs: 0.7020 (0.7408) grad_norm: 0.4604 (0.4761) time: 5.5610 data: 0.0001 max mem: 71357 -[08:33:58.421784] Epoch: [3] [5790/6500] lr: 0.000005 closs: 0.7253 (0.7408) grad_norm: 0.4554 (0.4763) time: 5.5695 data: 0.0001 max mem: 71357 -[08:34:54.273999] Epoch: [3] [5800/6500] lr: 0.000005 closs: 0.7283 (0.7408) grad_norm: 0.4415 (0.4762) time: 5.5796 data: 0.0001 max mem: 71357 -[08:35:49.986871] Epoch: [3] [5810/6500] lr: 0.000005 closs: 0.7905 (0.7410) grad_norm: 0.4331 (0.4761) time: 5.5782 data: 0.0001 max mem: 71357 -[08:36:45.654555] Epoch: [3] [5820/6500] lr: 0.000005 closs: 0.7656 (0.7409) grad_norm: 0.4289 (0.4761) time: 5.5690 data: 0.0001 max mem: 71357 -[08:37:41.471240] Epoch: [3] [5830/6500] lr: 0.000005 closs: 0.6852 (0.7408) grad_norm: 0.3691 (0.4759) time: 5.5741 data: 0.0001 max mem: 71357 -[08:38:37.093389] Epoch: [3] [5840/6500] lr: 0.000005 closs: 0.6992 (0.7408) grad_norm: 0.3979 (0.4759) time: 5.5719 data: 0.0001 max mem: 71357 -[08:39:32.812807] Epoch: [3] [5850/6500] lr: 0.000005 closs: 0.7118 (0.7407) grad_norm: 0.3971 (0.4758) time: 5.5670 data: 0.0001 max mem: 71357 -[08:40:28.562085] Epoch: [3] [5860/6500] lr: 0.000005 closs: 0.7175 (0.7408) grad_norm: 0.3971 (0.4757) time: 5.5734 data: 0.0001 max mem: 71357 -[08:41:24.343575] Epoch: [3] [5870/6500] lr: 0.000005 closs: 0.7244 (0.7408) grad_norm: 0.4018 (0.4756) time: 5.5765 data: 0.0001 max mem: 71357 -[08:42:20.067075] Epoch: [3] [5880/6500] lr: 0.000005 closs: 0.7244 (0.7408) grad_norm: 0.4018 (0.4757) time: 5.5752 data: 0.0001 max mem: 71357 -[08:43:15.679788] Epoch: [3] [5890/6500] lr: 0.000005 closs: 0.7645 (0.7409) grad_norm: 0.4559 (0.4758) time: 5.5667 data: 0.0001 max mem: 71357 -[08:44:11.290632] Epoch: [3] [5900/6500] lr: 0.000005 closs: 0.7315 (0.7409) grad_norm: 0.4684 (0.4758) time: 5.5611 data: 0.0001 max mem: 71357 -[08:45:06.943686] Epoch: [3] [5910/6500] lr: 0.000005 closs: 0.7315 (0.7409) grad_norm: 0.4615 (0.4757) time: 5.5631 data: 0.0001 max mem: 71357 -[08:46:02.654843] Epoch: [3] [5920/6500] lr: 0.000005 closs: 0.7520 (0.7409) grad_norm: 0.4684 (0.4758) time: 5.5681 data: 0.0001 max mem: 71357 -[08:46:58.263460] Epoch: [3] [5930/6500] lr: 0.000005 closs: 0.7815 (0.7410) grad_norm: 0.4616 (0.4759) time: 5.5659 data: 0.0001 max mem: 71357 -[08:47:53.923587] Epoch: [3] [5940/6500] lr: 0.000005 closs: 0.7815 (0.7410) grad_norm: 0.4616 (0.4763) time: 5.5633 data: 0.0001 max mem: 71357 -[08:48:49.647222] Epoch: [3] [5950/6500] lr: 0.000005 closs: 0.7723 (0.7411) grad_norm: 0.4731 (0.4763) time: 5.5691 data: 0.0001 max mem: 71357 -[08:49:45.438272] Epoch: [3] [5960/6500] lr: 0.000005 closs: 0.6831 (0.7408) grad_norm: 0.4702 (0.4763) time: 5.5757 data: 0.0001 max mem: 71357 -[08:50:41.080388] Epoch: [3] [5970/6500] lr: 0.000005 closs: 0.7002 (0.7408) grad_norm: 0.4085 (0.4761) time: 5.5716 data: 0.0001 max mem: 71357 -[08:51:36.713518] Epoch: [3] [5980/6500] lr: 0.000005 closs: 0.7981 (0.7411) grad_norm: 0.4085 (0.4763) time: 5.5637 data: 0.0001 max mem: 71357 -[08:52:32.360209] Epoch: [3] [5990/6500] lr: 0.000005 closs: 0.8063 (0.7411) grad_norm: 0.3989 (0.4761) time: 5.5639 data: 0.0001 max mem: 71357 -[08:53:28.046393] Epoch: [3] [6000/6500] lr: 0.000005 closs: 0.7429 (0.7410) grad_norm: 0.4288 (0.4762) time: 5.5666 data: 0.0001 max mem: 71357 -[08:54:23.776691] Epoch: [3] [6010/6500] lr: 0.000005 closs: 0.7170 (0.7410) grad_norm: 0.4823 (0.4762) time: 5.5707 data: 0.0001 max mem: 71357 -[08:55:19.330299] Epoch: [3] [6020/6500] lr: 0.000005 closs: 0.6639 (0.7408) grad_norm: 0.4531 (0.4762) time: 5.5641 data: 0.0001 max mem: 71357 -[08:56:15.081992] Epoch: [3] [6030/6500] lr: 0.000005 closs: 0.6893 (0.7408) grad_norm: 0.4349 (0.4761) time: 5.5652 data: 0.0001 max mem: 71357 -[08:57:10.718534] Epoch: [3] [6040/6500] lr: 0.000005 closs: 0.7329 (0.7409) grad_norm: 0.3941 (0.4765) time: 5.5693 data: 0.0001 max mem: 71357 -[08:58:06.494709] Epoch: [3] [6050/6500] lr: 0.000005 closs: 0.8222 (0.7411) grad_norm: 0.3941 (0.4766) time: 5.5705 data: 0.0001 max mem: 71357 -[08:59:02.188091] Epoch: [3] [6060/6500] lr: 0.000005 closs: 0.7643 (0.7410) grad_norm: 0.3920 (0.4764) time: 5.5734 data: 0.0001 max mem: 71357 -[08:59:57.800593] Epoch: [3] [6070/6500] lr: 0.000005 closs: 0.7363 (0.7410) grad_norm: 0.4194 (0.4764) time: 5.5652 data: 0.0001 max mem: 71357 -[09:00:53.339756] Epoch: [3] [6080/6500] lr: 0.000005 closs: 0.7633 (0.7411) grad_norm: 0.4194 (0.4763) time: 5.5575 data: 0.0001 max mem: 71357 -[09:01:49.113502] Epoch: [3] [6090/6500] lr: 0.000005 closs: 0.7977 (0.7411) grad_norm: 0.4062 (0.4764) time: 5.5656 data: 0.0001 max mem: 71357 -[09:02:44.848015] Epoch: [3] [6100/6500] lr: 0.000005 closs: 0.8059 (0.7412) grad_norm: 0.4194 (0.4763) time: 5.5754 data: 0.0001 max mem: 71357 -[09:03:40.394260] Epoch: [3] [6110/6500] lr: 0.000005 closs: 0.7635 (0.7411) grad_norm: 0.4153 (0.4764) time: 5.5640 data: 0.0001 max mem: 71357 -[09:04:36.100670] Epoch: [3] [6120/6500] lr: 0.000005 closs: 0.6694 (0.7411) grad_norm: 0.3990 (0.4763) time: 5.5625 data: 0.0001 max mem: 71357 -[09:05:31.751861] Epoch: [3] [6130/6500] lr: 0.000005 closs: 0.7213 (0.7412) grad_norm: 0.4238 (0.4762) time: 5.5678 data: 0.0001 max mem: 71357 -[09:06:27.636239] Epoch: [3] [6140/6500] lr: 0.000005 closs: 0.7295 (0.7412) grad_norm: 0.4095 (0.4760) time: 5.5767 data: 0.0001 max mem: 71357 -[09:07:23.235152] Epoch: [3] [6150/6500] lr: 0.000005 closs: 0.7065 (0.7412) grad_norm: 0.4102 (0.4761) time: 5.5741 data: 0.0001 max mem: 71357 -[09:08:18.818036] Epoch: [3] [6160/6500] lr: 0.000005 closs: 0.7877 (0.7413) grad_norm: 0.4559 (0.4763) time: 5.5590 data: 0.0001 max mem: 71357 -[09:09:14.505759] Epoch: [3] [6170/6500] lr: 0.000005 closs: 0.7101 (0.7411) grad_norm: 0.4395 (0.4762) time: 5.5634 data: 0.0001 max mem: 71357 -[09:10:10.350422] Epoch: [3] [6180/6500] lr: 0.000005 closs: 0.6311 (0.7409) grad_norm: 0.5068 (0.4763) time: 5.5765 data: 0.0001 max mem: 71357 -[09:11:06.023015] Epoch: [3] [6190/6500] lr: 0.000005 closs: 0.7146 (0.7410) grad_norm: 0.4877 (0.4762) time: 5.5758 data: 0.0001 max mem: 71357 -[09:12:01.785870] Epoch: [3] [6200/6500] lr: 0.000005 closs: 0.7406 (0.7411) grad_norm: 0.4877 (0.4763) time: 5.5717 data: 0.0001 max mem: 71357 -[09:12:57.440583] Epoch: [3] [6210/6500] lr: 0.000005 closs: 0.7689 (0.7412) grad_norm: 0.4298 (0.4761) time: 5.5708 data: 0.0001 max mem: 71357 -[09:13:53.257089] Epoch: [3] [6220/6500] lr: 0.000005 closs: 0.7652 (0.7412) grad_norm: 0.4117 (0.4761) time: 5.5735 data: 0.0001 max mem: 71357 -[09:14:48.956788] Epoch: [3] [6230/6500] lr: 0.000005 closs: 0.7270 (0.7411) grad_norm: 0.4298 (0.4761) time: 5.5757 data: 0.0001 max mem: 71357 -[09:15:44.591134] Epoch: [3] [6240/6500] lr: 0.000005 closs: 0.7631 (0.7412) grad_norm: 0.4232 (0.4763) time: 5.5666 data: 0.0001 max mem: 71357 -[09:16:40.191312] Epoch: [3] [6250/6500] lr: 0.000005 closs: 0.8537 (0.7414) grad_norm: 0.4668 (0.4763) time: 5.5617 data: 0.0001 max mem: 71357 -[09:17:35.838897] Epoch: [3] [6260/6500] lr: 0.000005 closs: 0.8001 (0.7414) grad_norm: 0.4668 (0.4763) time: 5.5623 data: 0.0001 max mem: 71357 -[09:18:31.640816] Epoch: [3] [6270/6500] lr: 0.000005 closs: 0.7339 (0.7415) grad_norm: 0.4230 (0.4763) time: 5.5724 data: 0.0001 max mem: 71357 -[09:19:27.206966] Epoch: [3] [6280/6500] lr: 0.000005 closs: 0.7339 (0.7414) grad_norm: 0.4230 (0.4763) time: 5.5683 data: 0.0001 max mem: 71357 -[09:20:22.913908] Epoch: [3] [6290/6500] lr: 0.000005 closs: 0.7190 (0.7414) grad_norm: 0.4122 (0.4762) time: 5.5636 data: 0.0001 max mem: 71357 -[09:21:18.497214] Epoch: [3] [6300/6500] lr: 0.000005 closs: 0.7719 (0.7415) grad_norm: 0.4122 (0.4762) time: 5.5644 data: 0.0001 max mem: 71357 -[09:22:14.290866] Epoch: [3] [6310/6500] lr: 0.000005 closs: 0.7057 (0.7415) grad_norm: 0.4122 (0.4761) time: 5.5688 data: 0.0001 max mem: 71357 -[09:23:09.931856] Epoch: [3] [6320/6500] lr: 0.000005 closs: 0.6686 (0.7414) grad_norm: 0.4850 (0.4765) time: 5.5717 data: 0.0001 max mem: 71357 -[09:24:05.612801] Epoch: [3] [6330/6500] lr: 0.000005 closs: 0.6958 (0.7414) grad_norm: 0.4899 (0.4765) time: 5.5660 data: 0.0001 max mem: 71357 -[09:25:01.244089] Epoch: [3] [6340/6500] lr: 0.000005 closs: 0.7575 (0.7415) grad_norm: 0.4416 (0.4764) time: 5.5655 data: 0.0001 max mem: 71357 -[09:25:56.860737] Epoch: [3] [6350/6500] lr: 0.000005 closs: 0.7575 (0.7415) grad_norm: 0.4298 (0.4763) time: 5.5623 data: 0.0001 max mem: 71357 -[09:26:52.670083] Epoch: [3] [6360/6500] lr: 0.000005 closs: 0.7802 (0.7416) grad_norm: 0.4094 (0.4762) time: 5.5712 data: 0.0001 max mem: 71357 -[09:27:48.219140] Epoch: [3] [6370/6500] lr: 0.000005 closs: 0.7762 (0.7416) grad_norm: 0.4253 (0.4763) time: 5.5678 data: 0.0001 max mem: 71357 -[09:28:43.931513] Epoch: [3] [6380/6500] lr: 0.000005 closs: 0.7411 (0.7417) grad_norm: 0.4253 (0.4762) time: 5.5630 data: 0.0001 max mem: 71357 -[09:29:39.533688] Epoch: [3] [6390/6500] lr: 0.000005 closs: 0.7411 (0.7417) grad_norm: 0.4533 (0.4762) time: 5.5656 data: 0.0001 max mem: 71357 -[09:30:35.257343] Epoch: [3] [6400/6500] lr: 0.000005 closs: 0.7405 (0.7417) grad_norm: 0.4680 (0.4762) time: 5.5662 data: 0.0001 max mem: 71357 -[09:31:30.993736] Epoch: [3] [6410/6500] lr: 0.000005 closs: 0.7159 (0.7417) grad_norm: 0.4596 (0.4761) time: 5.5729 data: 0.0001 max mem: 71357 -[09:32:26.749437] Epoch: [3] [6420/6500] lr: 0.000005 closs: 0.7601 (0.7419) grad_norm: 0.4596 (0.4761) time: 5.5745 data: 0.0001 max mem: 71357 -[09:33:22.454703] Epoch: [3] [6430/6500] lr: 0.000005 closs: 0.7906 (0.7420) grad_norm: 0.4588 (0.4761) time: 5.5730 data: 0.0001 max mem: 71357 -[09:34:18.165306] Epoch: [3] [6440/6500] lr: 0.000005 closs: 0.7978 (0.7421) grad_norm: 0.4588 (0.4762) time: 5.5707 data: 0.0001 max mem: 71357 -[09:35:13.814875] Epoch: [3] [6450/6500] lr: 0.000005 closs: 0.8280 (0.7422) grad_norm: 0.4473 (0.4763) time: 5.5679 data: 0.0001 max mem: 71357 -[09:36:09.491251] Epoch: [3] [6460/6500] lr: 0.000005 closs: 0.7346 (0.7422) grad_norm: 0.4588 (0.4768) time: 5.5662 data: 0.0001 max mem: 71357 -[09:37:05.187478] Epoch: [3] [6470/6500] lr: 0.000005 closs: 0.7346 (0.7421) grad_norm: 0.4473 (0.4767) time: 5.5685 data: 0.0001 max mem: 71357 -[09:38:00.919941] Epoch: [3] [6480/6500] lr: 0.000005 closs: 0.6989 (0.7421) grad_norm: 0.4163 (0.4766) time: 5.5713 data: 0.0001 max mem: 71357 -[09:38:56.716451] Epoch: [3] [6490/6500] lr: 0.000005 closs: 0.7505 (0.7422) grad_norm: 0.4275 (0.4767) time: 5.5763 data: 0.0001 max mem: 71357 -[09:39:47.308149] Epoch: [3] Total time: 10:03:39 -[09:39:47.344265] Averaged stats: lr: 0.000005 closs: 0.7505 (0.7420) grad_norm: 0.4643 (0.4769) -[09:39:47.504858] model saved -[09:39:48.501829] optimizer saved -[09:39:48.502389] other rank-common saved -[09:39:48.505739] rank-specific saved -[09:39:48.505904] Training time 1 day, 16:16:17