diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..e6ab1a9f90df742fbbeaaff57f991959b09536a9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c2bc3748c8253570b7f39e83b85b98efc58dcc9d578c70d964eff23c0758e79
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..873c2d5e2a0e5d11184ce80750cb6df975ae3062
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47c860eebadd34a2340393d7937edd7b8f916dc12b6cae7486098680d99b71ca
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..669cd774b329a01f638aeb2c9f895a064804cad8
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11832166d9d4ff336967be7a3443fe621f59525b7288833d8f702a8055db6a4b
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f4fa7f11285853eb02a4d8cde1c2b03fa67d7f60
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4abed9d7d9977796a1f57677a44751bf53987df68bcd07387f76938dd12242b
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..47050520ca1f588b641aae8a1fad6667ce2e3d52
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d849554de5927f9602e5c338b3f5129ede71c30c49491ce3c80a38db97b97b6a
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9a72b8f0c8ae208a697aa2072777c23dfac1e589
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95adb9a94ac0f196f54dab790f14bfc69fd17b28e0ac05fc63d5330038b53d1f
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..79297a203c019abd4e9dee80083eaa1bd809ddec
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ef65a1c31775b30f1ca6689ee1775bc46797f24774e3ba5f6c519191b300041
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..99234b7cc0f09d9206b5cfdf92cf59852bd365e5
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f027a5cb40b85d2ad50263a5170c8cf5124b3298b2d9f3f2881679345b27be5b
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..eff1ae059eabc81c95811daac0c373442a99a167
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66313368fa7978c2341a231255af3cd5880b57c79fe18bd08e682f33a319765b
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/log.txt b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/log.txt
new file mode 100644
index 0000000000000000000000000000000000000000..250ab85f95ff9f0eae9e25c9b73f8a650ff8dd69
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/log.txt
@@ -0,0 +1,3 @@
+{"train_lr": 8.508628427162843e-05, "train_grad_norm": 0.7172298170107, "train_closs": 1.0847456962903907, "epoch": 0, "val_lr": 8.508628427162843e-05, "val_grad_norm": 0.7172298170107, "val_closs": 1.0847456962903907}
+{"train_lr": 5.755148397302138e-05, "train_grad_norm": 0.5609518145953734, "train_closs": 1.0388896298569104, "epoch": 1, "val_lr": 5.755148397302138e-05, "val_grad_norm": 0.5609518145953734, "val_closs": 1.0388896298569104}
+{"train_lr": 1.4361453685455227e-05, "train_grad_norm": 0.5708354987866298, "train_closs": 1.0262249000958887, "epoch": 2, "val_lr": 1.4361453685455227e-05, "val_grad_norm": 0.5708354987866298, "val_closs": 1.0262249000958887}
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/output.log b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..d5e339d7f62f18caf95300266bf0613e86bc4fbd
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/output.log
@@ -0,0 +1,4738 @@
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+***************************************** +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 0): env://, gpu 0 +[22:49:15.994849] > initializing model parallel with size 1 +[22:49:15.994911] > initializing ddp with size 8 +[22:49:15.994917] > initializing pipeline with size 1 +[22:49:16.163185] job dir: /data/liuyijiang/mmlab/LLaMA2-Accessory/accessory +[22:49:16.163260] Namespace(batch_size=16, +accum_iter=1, +llama_type='llama_qformerv2_peft', +llama_config=['../checkpoints/llama2/Llama-2-13b/params.json', +'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], +no_visual=False, +tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', +pretrained_path='../checkpoints/mm/lamaQformerv2_13b/finetuned/', +pretrained_type='consolidated', +weight_decay=0.02, +lr=0.0001, +min_lr=5e-06, +epochs=3, +warmup_epochs=0.2, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/mm/alpaca_llava.yaml', +output_dir='output/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=16, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[22:49:16.163895] Start initialization. +[22:49:16.163930] ## Processing on RANK 0. +[22:49:16.173919] Model Args: + ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, rope_scaling=None, lora_rank=16, bias_tuning=True) +[22:50:48.722048] build llama model with qformerv2 + Loading checkpoint shards: 0%| | 0/2 [00:00 +[23:22:27.864578] Start training for 3 epochs +[23:22:27.878956] log_dir: ./output_dir +[23:22:43.639857] Epoch: [0] [0/3229] lr: 0.000000 grad_norm: 2.3643 (2.3643) closs: 1.5947 (1.5947) time: 15.7601 data: 8.9894 max mem: 36209 +[23:23:23.795186] Epoch: [0] [10/3229] lr: 0.000002 grad_norm: 2.2598 (2.1838) closs: 1.3996 (1.3615) time: 5.0832 data: 0.8175 max mem: 54683 +[23:24:04.512812] Epoch: [0] [20/3229] lr: 0.000003 grad_norm: 2.2598 (2.2052) closs: 1.4486 (1.4404) time: 4.0436 data: 0.0002 max mem: 54683 +[23:24:44.625235] Epoch: [0] [30/3229] lr: 0.000005 grad_norm: 2.2136 (2.1731) closs: 1.4720 (1.4210) time: 4.0414 data: 0.0002 max mem: 54683 +[23:25:25.084742] Epoch: [0] [40/3229] lr: 0.000006 grad_norm: 2.0006 (2.0774) closs: 1.3830 (1.3968) time: 4.0285 data: 0.0002 max mem: 54683 +[23:26:06.204362] Epoch: [0] [50/3229] lr: 0.000008 grad_norm: 1.6879 (1.9920) closs: 1.3858 (1.4054) time: 4.0789 data: 0.0002 max mem: 54683 +[23:26:46.695153] Epoch: [0] [60/3229] lr: 0.000009 grad_norm: 1.5039 (1.8916) closs: 1.3927 (1.3882) time: 4.0804 data: 0.0002 max mem: 54683 +[23:27:27.187491] Epoch: [0] [70/3229] lr: 0.000011 grad_norm: 1.2135 (1.7928) closs: 1.2954 (1.3740) time: 4.0491 data: 0.0002 max mem: 54683 +[23:28:08.005236] Epoch: [0] [80/3229] lr: 0.000012 grad_norm: 1.0767 (1.7056) closs: 1.2613 (1.3602) time: 4.0654 data: 0.0002 max mem: 54683 +[23:28:48.149547] Epoch: [0] [90/3229] lr: 0.000014 
grad_norm: 0.9057 (1.6181) closs: 1.2135 (1.3429) time: 4.0480 data: 0.0002 max mem: 54683 +[23:29:28.932772] Epoch: [0] [100/3229] lr: 0.000015 grad_norm: 0.8791 (1.5484) closs: 1.1981 (1.3292) time: 4.0463 data: 0.0002 max mem: 54683 +[23:30:09.405095] Epoch: [0] [110/3229] lr: 0.000017 grad_norm: 0.8511 (1.4835) closs: 1.2455 (1.3201) time: 4.0627 data: 0.0002 max mem: 54683 +[23:30:50.534293] Epoch: [0] [120/3229] lr: 0.000019 grad_norm: 0.8250 (1.4295) closs: 1.2488 (1.3134) time: 4.0800 data: 0.0002 max mem: 54683 +[23:31:30.701816] Epoch: [0] [130/3229] lr: 0.000020 grad_norm: 0.8259 (1.3840) closs: 1.2241 (1.2992) time: 4.0648 data: 0.0002 max mem: 54683 +[23:32:11.513620] Epoch: [0] [140/3229] lr: 0.000022 grad_norm: 0.8054 (1.3437) closs: 1.1745 (1.2890) time: 4.0489 data: 0.0002 max mem: 54683 +[23:32:52.318143] Epoch: [0] [150/3229] lr: 0.000023 grad_norm: 0.8054 (1.3094) closs: 1.1397 (1.2791) time: 4.0807 data: 0.0002 max mem: 54683 +[23:33:33.496187] Epoch: [0] [160/3229] lr: 0.000025 grad_norm: 0.7959 (1.2774) closs: 1.1462 (1.2732) time: 4.0991 data: 0.0002 max mem: 54683 +[23:34:14.627260] Epoch: [0] [170/3229] lr: 0.000026 grad_norm: 0.8027 (1.2542) closs: 1.1510 (1.2673) time: 4.1154 data: 0.0002 max mem: 54683 +[23:34:55.768947] Epoch: [0] [180/3229] lr: 0.000028 grad_norm: 0.8896 (1.2328) closs: 1.1858 (1.2634) time: 4.1136 data: 0.0002 max mem: 54683 +[23:35:36.915854] Epoch: [0] [190/3229] lr: 0.000029 grad_norm: 0.8623 (1.2141) closs: 1.2039 (1.2594) time: 4.1144 data: 0.0002 max mem: 54683 +[23:36:18.384153] Epoch: [0] [200/3229] lr: 0.000031 grad_norm: 0.8326 (1.1966) closs: 1.1849 (1.2539) time: 4.1307 data: 0.0002 max mem: 54683 +[23:36:59.200348] Epoch: [0] [210/3229] lr: 0.000033 grad_norm: 0.8326 (1.1812) closs: 1.1680 (1.2500) time: 4.1141 data: 0.0002 max mem: 54684 +[23:37:39.707942] Epoch: [0] [220/3229] lr: 0.000034 grad_norm: 0.8574 (1.1681) closs: 1.1718 (1.2469) time: 4.0661 data: 0.0002 max mem: 54684 +[23:38:20.825766] Epoch: [0] [230/3229] lr: 0.000036 grad_norm: 0.8711 (1.1558) closs: 1.1853 (1.2439) time: 4.0812 data: 0.0002 max mem: 54684 +[23:39:01.980002] Epoch: [0] [240/3229] lr: 0.000037 grad_norm: 0.8711 (1.1440) closs: 1.1805 (1.2402) time: 4.1135 data: 0.0002 max mem: 54684 +[23:39:42.136851] Epoch: [0] [250/3229] lr: 0.000039 grad_norm: 0.8573 (1.1322) closs: 1.1219 (1.2355) time: 4.0655 data: 0.0002 max mem: 54684 +[23:40:21.973104] Epoch: [0] [260/3229] lr: 0.000040 grad_norm: 0.8358 (1.1208) closs: 1.1161 (1.2306) time: 3.9996 data: 0.0002 max mem: 54684 +[23:41:02.441963] Epoch: [0] [270/3229] lr: 0.000042 grad_norm: 0.8472 (1.1130) closs: 1.1082 (1.2250) time: 4.0152 data: 0.0002 max mem: 54684 +[23:41:43.505784] Epoch: [0] [280/3229] lr: 0.000043 grad_norm: 0.8502 (1.1037) closs: 1.1082 (1.2214) time: 4.0766 data: 0.0002 max mem: 54684 +[23:42:23.986322] Epoch: [0] [290/3229] lr: 0.000045 grad_norm: 0.8341 (1.0958) closs: 1.1663 (1.2195) time: 4.0771 data: 0.0002 max mem: 54684 +[23:43:04.792000] Epoch: [0] [300/3229] lr: 0.000046 grad_norm: 0.9007 (1.0917) closs: 1.1762 (1.2175) time: 4.0642 data: 0.0002 max mem: 54684 +[23:43:44.938269] Epoch: [0] [310/3229] lr: 0.000048 grad_norm: 0.8587 (1.0835) closs: 1.1480 (1.2142) time: 4.0475 data: 0.0002 max mem: 54684 +[23:44:24.933818] Epoch: [0] [320/3229] lr: 0.000050 grad_norm: 0.8564 (1.0763) closs: 1.1218 (1.2100) time: 4.0070 data: 0.0002 max mem: 54684 +[23:45:05.719682] Epoch: [0] [330/3229] lr: 0.000051 grad_norm: 0.8780 (1.0704) closs: 1.1218 (1.2077) time: 4.0390 data: 
0.0002 max mem: 54684 +[23:45:46.851537] Epoch: [0] [340/3229] lr: 0.000053 grad_norm: 0.8750 (1.0652) closs: 1.1309 (1.2051) time: 4.0958 data: 0.0002 max mem: 54684 +[23:46:27.990122] Epoch: [0] [350/3229] lr: 0.000054 grad_norm: 0.8628 (1.0599) closs: 1.1119 (1.2035) time: 4.1134 data: 0.0002 max mem: 54684 +[23:47:08.873601] Epoch: [0] [360/3229] lr: 0.000056 grad_norm: 0.8519 (1.0544) closs: 1.1044 (1.2005) time: 4.1010 data: 0.0002 max mem: 54684 +[23:47:49.692722] Epoch: [0] [370/3229] lr: 0.000057 grad_norm: 0.8766 (1.0503) closs: 1.1115 (1.1989) time: 4.0851 data: 0.0002 max mem: 54684 +[23:48:30.164462] Epoch: [0] [380/3229] lr: 0.000059 grad_norm: 0.8993 (1.0490) closs: 1.1255 (1.1965) time: 4.0645 data: 0.0002 max mem: 54684 +[23:49:10.664172] Epoch: [0] [390/3229] lr: 0.000060 grad_norm: 0.8927 (1.0446) closs: 1.1042 (1.1944) time: 4.0485 data: 0.0002 max mem: 54684 +[23:49:51.854914] Epoch: [0] [400/3229] lr: 0.000062 grad_norm: 0.8927 (1.0418) closs: 1.1041 (1.1918) time: 4.0844 data: 0.0002 max mem: 54684 +[23:50:32.019553] Epoch: [0] [410/3229] lr: 0.000063 grad_norm: 0.8438 (1.0362) closs: 1.0953 (1.1888) time: 4.0677 data: 0.0002 max mem: 54684 +[23:51:13.159641] Epoch: [0] [420/3229] lr: 0.000065 grad_norm: 0.8438 (1.0345) closs: 1.0719 (1.1871) time: 4.0652 data: 0.0002 max mem: 54684 +[23:51:53.971366] Epoch: [0] [430/3229] lr: 0.000067 grad_norm: 0.8894 (1.0317) closs: 1.0982 (1.1850) time: 4.0975 data: 0.0003 max mem: 54684 +[23:52:34.857552] Epoch: [0] [440/3229] lr: 0.000068 grad_norm: 0.8894 (1.0279) closs: 1.1149 (1.1830) time: 4.0848 data: 0.0003 max mem: 54684 +[23:53:15.996672] Epoch: [0] [450/3229] lr: 0.000070 grad_norm: 0.8831 (1.0249) closs: 1.1271 (1.1822) time: 4.1012 data: 0.0002 max mem: 54684 +[23:53:56.818198] Epoch: [0] [460/3229] lr: 0.000071 grad_norm: 0.8629 (1.0207) closs: 1.1466 (1.1812) time: 4.0980 data: 0.0002 max mem: 54684 +[23:54:36.960756] Epoch: [0] [470/3229] lr: 0.000073 grad_norm: 0.8488 (1.0179) closs: 1.1201 (1.1796) time: 4.0481 data: 0.0002 max mem: 54684 +[23:55:18.098024] Epoch: [0] [480/3229] lr: 0.000074 grad_norm: 0.8674 (1.0151) closs: 1.0820 (1.1774) time: 4.0639 data: 0.0002 max mem: 54684 +[23:55:58.929103] Epoch: [0] [490/3229] lr: 0.000076 grad_norm: 0.8972 (1.0126) closs: 1.0761 (1.1759) time: 4.0983 data: 0.0002 max mem: 54684 +[23:56:39.402425] Epoch: [0] [500/3229] lr: 0.000077 grad_norm: 0.8575 (1.0095) closs: 1.1150 (1.1745) time: 4.0651 data: 0.0002 max mem: 54684 +[23:57:20.334121] Epoch: [0] [510/3229] lr: 0.000079 grad_norm: 0.8890 (1.0078) closs: 1.1150 (1.1731) time: 4.0702 data: 0.0002 max mem: 54684 +[23:58:01.372021] Epoch: [0] [520/3229] lr: 0.000081 grad_norm: 0.8890 (1.0051) closs: 1.1070 (1.1710) time: 4.0984 data: 0.0002 max mem: 54684 +[23:58:42.144275] Epoch: [0] [530/3229] lr: 0.000082 grad_norm: 0.8428 (1.0018) closs: 1.1070 (1.1706) time: 4.0904 data: 0.0002 max mem: 54684 +[23:59:22.926638] Epoch: [0] [540/3229] lr: 0.000084 grad_norm: 0.8212 (0.9987) closs: 1.1551 (1.1702) time: 4.0777 data: 0.0002 max mem: 54684 +[00:00:03.149794] Epoch: [0] [550/3229] lr: 0.000085 grad_norm: 0.8337 (0.9958) closs: 1.1042 (1.1677) time: 4.0502 data: 0.0002 max mem: 54684 +[00:00:44.238609] Epoch: [0] [560/3229] lr: 0.000087 grad_norm: 0.8550 (0.9929) closs: 1.0702 (1.1667) time: 4.0655 data: 0.0002 max mem: 54684 +[00:01:24.695417] Epoch: [0] [570/3229] lr: 0.000088 grad_norm: 0.8658 (0.9915) closs: 1.1175 (1.1665) time: 4.0772 data: 0.0002 max mem: 54684 +[00:02:05.811689] Epoch: [0] [580/3229] lr: 
0.000090 grad_norm: 0.8485 (0.9889) closs: 1.1632 (1.1661) time: 4.0786 data: 0.0002 max mem: 54684 +[00:02:46.823399] Epoch: [0] [590/3229] lr: 0.000091 grad_norm: 0.8317 (0.9862) closs: 1.1259 (1.1654) time: 4.1063 data: 0.0002 max mem: 54684 +[00:03:26.797835] Epoch: [0] [600/3229] lr: 0.000093 grad_norm: 0.7965 (0.9830) closs: 1.1026 (1.1643) time: 4.0492 data: 0.0002 max mem: 54684 +[00:04:07.579881] Epoch: [0] [610/3229] lr: 0.000094 grad_norm: 0.7848 (0.9798) closs: 1.1218 (1.1633) time: 4.0378 data: 0.0002 max mem: 54684 +[00:04:48.700597] Epoch: [0] [620/3229] lr: 0.000096 grad_norm: 0.8221 (0.9776) closs: 1.1324 (1.1630) time: 4.0951 data: 0.0002 max mem: 54684 +[00:05:29.642483] Epoch: [0] [630/3229] lr: 0.000098 grad_norm: 0.8110 (0.9744) closs: 1.1305 (1.1621) time: 4.1031 data: 0.0002 max mem: 54684 +[00:06:10.262537] Epoch: [0] [640/3229] lr: 0.000099 grad_norm: 0.8096 (0.9728) closs: 1.1009 (1.1604) time: 4.0780 data: 0.0002 max mem: 54684 +[00:06:50.715955] Epoch: [0] [650/3229] lr: 0.000100 grad_norm: 0.8278 (0.9702) closs: 1.0615 (1.1584) time: 4.0536 data: 0.0002 max mem: 54684 +[00:07:31.507808] Epoch: [0] [660/3229] lr: 0.000100 grad_norm: 0.7748 (0.9673) closs: 1.0615 (1.1577) time: 4.0622 data: 0.0002 max mem: 54684 +[00:08:12.189092] Epoch: [0] [670/3229] lr: 0.000100 grad_norm: 0.7953 (0.9656) closs: 1.1140 (1.1564) time: 4.0736 data: 0.0002 max mem: 54684 +[00:08:53.457197] Epoch: [0] [680/3229] lr: 0.000100 grad_norm: 0.8219 (0.9638) closs: 1.1346 (1.1563) time: 4.0974 data: 0.0002 max mem: 54684 +[00:09:34.230467] Epoch: [0] [690/3229] lr: 0.000100 grad_norm: 0.8043 (0.9615) closs: 1.1368 (1.1555) time: 4.1020 data: 0.0002 max mem: 54684 +[00:10:14.702838] Epoch: [0] [700/3229] lr: 0.000100 grad_norm: 0.7837 (0.9586) closs: 1.1245 (1.1542) time: 4.0622 data: 0.0002 max mem: 54684 +[00:10:55.922485] Epoch: [0] [710/3229] lr: 0.000100 grad_norm: 0.7837 (0.9568) closs: 1.1252 (1.1540) time: 4.0845 data: 0.0002 max mem: 54684 +[00:11:36.370423] Epoch: [0] [720/3229] lr: 0.000100 grad_norm: 0.7956 (0.9543) closs: 1.1058 (1.1526) time: 4.0833 data: 0.0002 max mem: 54684 +[00:12:17.473440] Epoch: [0] [730/3229] lr: 0.000100 grad_norm: 0.8050 (0.9522) closs: 1.0626 (1.1514) time: 4.0775 data: 0.0002 max mem: 54684 +[00:12:57.932552] Epoch: [0] [740/3229] lr: 0.000100 grad_norm: 0.7773 (0.9494) closs: 1.0728 (1.1505) time: 4.0780 data: 0.0002 max mem: 54684 +[00:13:39.080143] Epoch: [0] [750/3229] lr: 0.000100 grad_norm: 0.7764 (0.9473) closs: 1.0979 (1.1498) time: 4.0803 data: 0.0002 max mem: 54684 +[00:14:19.877475] Epoch: [0] [760/3229] lr: 0.000100 grad_norm: 0.7764 (0.9447) closs: 1.1119 (1.1494) time: 4.0972 data: 0.0002 max mem: 54684 +[00:15:00.672644] Epoch: [0] [770/3229] lr: 0.000100 grad_norm: 0.7562 (0.9422) closs: 1.1187 (1.1483) time: 4.0796 data: 0.0002 max mem: 54684 +[00:15:41.143591] Epoch: [0] [780/3229] lr: 0.000100 grad_norm: 0.7531 (0.9400) closs: 1.0727 (1.1473) time: 4.0632 data: 0.0002 max mem: 54684 +[00:16:21.920284] Epoch: [0] [790/3229] lr: 0.000100 grad_norm: 0.7644 (0.9379) closs: 1.0704 (1.1460) time: 4.0623 data: 0.0002 max mem: 54684 +[00:17:02.702494] Epoch: [0] [800/3229] lr: 0.000100 grad_norm: 0.7653 (0.9362) closs: 1.0833 (1.1455) time: 4.0779 data: 0.0002 max mem: 54684 +[00:17:43.158066] Epoch: [0] [810/3229] lr: 0.000100 grad_norm: 0.7914 (0.9341) closs: 1.0861 (1.1447) time: 4.0618 data: 0.0002 max mem: 54684 +[00:18:23.620826] Epoch: [0] [820/3229] lr: 0.000100 grad_norm: 0.7539 (0.9321) closs: 1.0861 (1.1438) time: 4.0459 
data: 0.0002 max mem: 54684 +[00:19:04.787142] Epoch: [0] [830/3229] lr: 0.000100 grad_norm: 0.7176 (0.9296) closs: 1.0843 (1.1434) time: 4.0814 data: 0.0002 max mem: 54684 +[00:19:45.891691] Epoch: [0] [840/3229] lr: 0.000100 grad_norm: 0.7310 (0.9275) closs: 1.0950 (1.1429) time: 4.1135 data: 0.0002 max mem: 54684 +[00:20:26.345278] Epoch: [0] [850/3229] lr: 0.000100 grad_norm: 0.7597 (0.9258) closs: 1.0950 (1.1420) time: 4.0778 data: 0.0002 max mem: 54684 +[00:21:06.501382] Epoch: [0] [860/3229] lr: 0.000100 grad_norm: 0.7321 (0.9235) closs: 1.0948 (1.1410) time: 4.0304 data: 0.0002 max mem: 54684 +[00:21:48.059783] Epoch: [0] [870/3229] lr: 0.000100 grad_norm: 0.7269 (0.9213) closs: 1.0870 (1.1405) time: 4.0857 data: 0.0002 max mem: 54684 +[00:22:29.175074] Epoch: [0] [880/3229] lr: 0.000100 grad_norm: 0.7318 (0.9197) closs: 1.0882 (1.1403) time: 4.1336 data: 0.0002 max mem: 54684 +[00:23:10.331842] Epoch: [0] [890/3229] lr: 0.000100 grad_norm: 0.7552 (0.9178) closs: 1.1258 (1.1399) time: 4.1135 data: 0.0002 max mem: 54684 +[00:23:51.116622] Epoch: [0] [900/3229] lr: 0.000100 grad_norm: 0.7552 (0.9160) closs: 1.1166 (1.1395) time: 4.0970 data: 0.0002 max mem: 54684 +[00:24:32.291626] Epoch: [0] [910/3229] lr: 0.000100 grad_norm: 0.7255 (0.9140) closs: 1.1076 (1.1388) time: 4.0979 data: 0.0002 max mem: 54684 +[00:25:12.749474] Epoch: [0] [920/3229] lr: 0.000100 grad_norm: 0.7110 (0.9118) closs: 1.0895 (1.1382) time: 4.0816 data: 0.0002 max mem: 54684 +[00:25:53.206260] Epoch: [0] [930/3229] lr: 0.000100 grad_norm: 0.7317 (0.9100) closs: 1.1072 (1.1377) time: 4.0457 data: 0.0002 max mem: 54684 +[00:26:34.341822] Epoch: [0] [940/3229] lr: 0.000100 grad_norm: 0.7409 (0.9084) closs: 1.1083 (1.1372) time: 4.0796 data: 0.0002 max mem: 54684 +[00:27:15.438692] Epoch: [0] [950/3229] lr: 0.000100 grad_norm: 0.7488 (0.9067) closs: 1.0973 (1.1366) time: 4.1116 data: 0.0002 max mem: 54684 +[00:27:56.584356] Epoch: [0] [960/3229] lr: 0.000100 grad_norm: 0.7426 (0.9057) closs: 1.0873 (1.1361) time: 4.1121 data: 0.0002 max mem: 54684 +[00:28:37.382221] Epoch: [0] [970/3229] lr: 0.000100 grad_norm: 0.7408 (0.9040) closs: 1.0993 (1.1355) time: 4.0971 data: 0.0002 max mem: 54684 +[00:29:17.838797] Epoch: [0] [980/3229] lr: 0.000100 grad_norm: 0.7385 (0.9023) closs: 1.1009 (1.1351) time: 4.0627 data: 0.0002 max mem: 54684 +[00:29:59.329393] Epoch: [0] [990/3229] lr: 0.000100 grad_norm: 0.7272 (0.9007) closs: 1.1175 (1.1350) time: 4.0973 data: 0.0002 max mem: 54684 +[00:30:39.787315] Epoch: [0] [1000/3229] lr: 0.000100 grad_norm: 0.7447 (0.8993) closs: 1.1062 (1.1341) time: 4.0974 data: 0.0002 max mem: 54684 +[00:31:20.577977] Epoch: [0] [1010/3229] lr: 0.000100 grad_norm: 0.7520 (0.8976) closs: 1.0464 (1.1330) time: 4.0624 data: 0.0002 max mem: 54684 +[00:32:01.381502] Epoch: [0] [1020/3229] lr: 0.000100 grad_norm: 0.7217 (0.8957) closs: 1.0770 (1.1327) time: 4.0796 data: 0.0002 max mem: 54684 +[00:32:42.943123] Epoch: [0] [1030/3229] lr: 0.000100 grad_norm: 0.7217 (0.8942) closs: 1.1380 (1.1326) time: 4.1182 data: 0.0002 max mem: 54684 +[00:33:23.714134] Epoch: [0] [1040/3229] lr: 0.000100 grad_norm: 0.7318 (0.8925) closs: 1.0825 (1.1320) time: 4.1166 data: 0.0002 max mem: 54684 +[00:34:04.838184] Epoch: [0] [1050/3229] lr: 0.000100 grad_norm: 0.7374 (0.8912) closs: 1.0988 (1.1318) time: 4.0947 data: 0.0002 max mem: 54684 +[00:34:46.091225] Epoch: [0] [1060/3229] lr: 0.000100 grad_norm: 0.7631 (0.8897) closs: 1.1092 (1.1316) time: 4.1188 data: 0.0002 max mem: 54684 +[00:35:26.212506] Epoch: [0] 
[1070/3229] lr: 0.000099 grad_norm: 0.7038 (0.8877) closs: 1.0837 (1.1307) time: 4.0686 data: 0.0002 max mem: 54684 +[00:36:07.340784] Epoch: [0] [1080/3229] lr: 0.000099 grad_norm: 0.7197 (0.8864) closs: 1.0947 (1.1307) time: 4.0624 data: 0.0002 max mem: 54684 +[00:36:48.165191] Epoch: [0] [1090/3229] lr: 0.000099 grad_norm: 0.7339 (0.8849) closs: 1.1309 (1.1305) time: 4.0976 data: 0.0002 max mem: 54684 +[00:37:28.183396] Epoch: [0] [1100/3229] lr: 0.000099 grad_norm: 0.7004 (0.8833) closs: 1.0832 (1.1297) time: 4.0421 data: 0.0002 max mem: 54684 +[00:38:08.517357] Epoch: [0] [1110/3229] lr: 0.000099 grad_norm: 0.7150 (0.8816) closs: 1.0764 (1.1290) time: 4.0175 data: 0.0002 max mem: 54684 +[00:38:49.323103] Epoch: [0] [1120/3229] lr: 0.000099 grad_norm: 0.7361 (0.8804) closs: 1.0986 (1.1287) time: 4.0569 data: 0.0002 max mem: 54684 +[00:39:30.123571] Epoch: [0] [1130/3229] lr: 0.000099 grad_norm: 0.7361 (0.8792) closs: 1.0772 (1.1279) time: 4.0802 data: 0.0002 max mem: 54684 +[00:40:10.803257] Epoch: [0] [1140/3229] lr: 0.000099 grad_norm: 0.7173 (0.8776) closs: 1.0506 (1.1273) time: 4.0739 data: 0.0002 max mem: 54684 +[00:40:51.438742] Epoch: [0] [1150/3229] lr: 0.000099 grad_norm: 0.6859 (0.8758) closs: 1.0506 (1.1263) time: 4.0657 data: 0.0002 max mem: 54684 +[00:41:31.922675] Epoch: [0] [1160/3229] lr: 0.000099 grad_norm: 0.6937 (0.8745) closs: 1.0536 (1.1256) time: 4.0559 data: 0.0002 max mem: 54684 +[00:42:12.389017] Epoch: [0] [1170/3229] lr: 0.000099 grad_norm: 0.7231 (0.8731) closs: 1.0545 (1.1245) time: 4.0474 data: 0.0002 max mem: 54684 +[00:42:52.804008] Epoch: [0] [1180/3229] lr: 0.000099 grad_norm: 0.7039 (0.8716) closs: 1.0439 (1.1238) time: 4.0440 data: 0.0002 max mem: 54684 +[00:43:33.442370] Epoch: [0] [1190/3229] lr: 0.000099 grad_norm: 0.7185 (0.8703) closs: 1.0626 (1.1232) time: 4.0526 data: 0.0002 max mem: 54684 +[00:44:14.248252] Epoch: [0] [1200/3229] lr: 0.000099 grad_norm: 0.7064 (0.8688) closs: 1.0911 (1.1229) time: 4.0721 data: 0.0002 max mem: 54684 +[00:44:54.730599] Epoch: [0] [1210/3229] lr: 0.000099 grad_norm: 0.6838 (0.8671) closs: 1.0769 (1.1222) time: 4.0643 data: 0.0002 max mem: 54684 +[00:45:35.439411] Epoch: [0] [1220/3229] lr: 0.000099 grad_norm: 0.6745 (0.8657) closs: 1.0487 (1.1217) time: 4.0595 data: 0.0002 max mem: 54684 +[00:46:16.740734] Epoch: [0] [1230/3229] lr: 0.000099 grad_norm: 0.6783 (0.8642) closs: 1.1197 (1.1218) time: 4.1004 data: 0.0002 max mem: 54684 +[00:46:57.546801] Epoch: [0] [1240/3229] lr: 0.000099 grad_norm: 0.7019 (0.8631) closs: 1.1150 (1.1215) time: 4.1053 data: 0.0002 max mem: 54684 +[00:47:38.021108] Epoch: [0] [1250/3229] lr: 0.000099 grad_norm: 0.7065 (0.8617) closs: 1.0681 (1.1212) time: 4.0640 data: 0.0002 max mem: 54684 +[00:48:18.636920] Epoch: [0] [1260/3229] lr: 0.000099 grad_norm: 0.6828 (0.8605) closs: 1.0594 (1.1205) time: 4.0544 data: 0.0002 max mem: 54684 +[00:48:59.298551] Epoch: [0] [1270/3229] lr: 0.000099 grad_norm: 0.6828 (0.8593) closs: 1.0436 (1.1198) time: 4.0638 data: 0.0002 max mem: 54684 +[00:49:40.435891] Epoch: [0] [1280/3229] lr: 0.000099 grad_norm: 0.6738 (0.8580) closs: 1.0650 (1.1194) time: 4.0899 data: 0.0002 max mem: 54684 +[00:50:20.609485] Epoch: [0] [1290/3229] lr: 0.000099 grad_norm: 0.6694 (0.8565) closs: 1.0689 (1.1191) time: 4.0655 data: 0.0002 max mem: 54684 +[00:51:00.959404] Epoch: [0] [1300/3229] lr: 0.000099 grad_norm: 0.6727 (0.8552) closs: 1.0693 (1.1186) time: 4.0261 data: 0.0002 max mem: 54684 +[00:51:41.919049] Epoch: [0] [1310/3229] lr: 0.000099 grad_norm: 0.6736 
(0.8538) closs: 1.0793 (1.1182) time: 4.0654 data: 0.0002 max mem: 54684 +[00:52:22.383012] Epoch: [0] [1320/3229] lr: 0.000099 grad_norm: 0.6669 (0.8526) closs: 1.0662 (1.1176) time: 4.0711 data: 0.0002 max mem: 54684 +[00:53:03.188621] Epoch: [0] [1330/3229] lr: 0.000099 grad_norm: 0.6665 (0.8512) closs: 1.0662 (1.1174) time: 4.0634 data: 0.0002 max mem: 54684 +[00:53:44.567629] Epoch: [0] [1340/3229] lr: 0.000099 grad_norm: 0.6947 (0.8502) closs: 1.1050 (1.1174) time: 4.1092 data: 0.0002 max mem: 54684 +[00:54:25.195105] Epoch: [0] [1350/3229] lr: 0.000099 grad_norm: 0.7112 (0.8491) closs: 1.0731 (1.1170) time: 4.1003 data: 0.0002 max mem: 54684 +[00:55:05.687079] Epoch: [0] [1360/3229] lr: 0.000099 grad_norm: 0.6784 (0.8480) closs: 1.0562 (1.1165) time: 4.0559 data: 0.0002 max mem: 54684 +[00:55:46.805581] Epoch: [0] [1370/3229] lr: 0.000099 grad_norm: 0.6794 (0.8470) closs: 1.0625 (1.1162) time: 4.0805 data: 0.0002 max mem: 54684 +[00:56:28.138612] Epoch: [0] [1380/3229] lr: 0.000098 grad_norm: 0.6794 (0.8460) closs: 1.0625 (1.1159) time: 4.1225 data: 0.0002 max mem: 54684 +[00:57:08.790631] Epoch: [0] [1390/3229] lr: 0.000098 grad_norm: 0.6731 (0.8447) closs: 1.0694 (1.1155) time: 4.0992 data: 0.0002 max mem: 54684 +[00:57:49.591828] Epoch: [0] [1400/3229] lr: 0.000098 grad_norm: 0.6726 (0.8434) closs: 1.0810 (1.1152) time: 4.0726 data: 0.0002 max mem: 54684 +[00:58:30.383532] Epoch: [0] [1410/3229] lr: 0.000098 grad_norm: 0.6579 (0.8422) closs: 1.0879 (1.1149) time: 4.0796 data: 0.0002 max mem: 54684 +[00:59:11.318426] Epoch: [0] [1420/3229] lr: 0.000098 grad_norm: 0.6579 (0.8409) closs: 1.0965 (1.1149) time: 4.0863 data: 0.0002 max mem: 54684 +[00:59:51.955092] Epoch: [0] [1430/3229] lr: 0.000098 grad_norm: 0.6626 (0.8394) closs: 1.0851 (1.1145) time: 4.0785 data: 0.0002 max mem: 54684 +[01:00:32.756437] Epoch: [0] [1440/3229] lr: 0.000098 grad_norm: 0.6626 (0.8383) closs: 1.0851 (1.1144) time: 4.0718 data: 0.0002 max mem: 54684 +[01:01:13.548959] Epoch: [0] [1450/3229] lr: 0.000098 grad_norm: 0.6736 (0.8370) closs: 1.1211 (1.1145) time: 4.0796 data: 0.0002 max mem: 54684 +[01:01:54.540506] Epoch: [0] [1460/3229] lr: 0.000098 grad_norm: 0.6815 (0.8361) closs: 1.1259 (1.1144) time: 4.0891 data: 0.0002 max mem: 54684 +[01:02:35.496041] Epoch: [0] [1470/3229] lr: 0.000098 grad_norm: 0.7130 (0.8352) closs: 1.0750 (1.1140) time: 4.0973 data: 0.0002 max mem: 54684 +[01:03:16.615434] Epoch: [0] [1480/3229] lr: 0.000098 grad_norm: 0.6600 (0.8339) closs: 1.0744 (1.1137) time: 4.1037 data: 0.0002 max mem: 54684 +[01:03:57.734129] Epoch: [0] [1490/3229] lr: 0.000098 grad_norm: 0.6600 (0.8330) closs: 1.0960 (1.1136) time: 4.1118 data: 0.0002 max mem: 54684 +[01:04:39.265592] Epoch: [0] [1500/3229] lr: 0.000098 grad_norm: 0.6993 (0.8322) closs: 1.0960 (1.1136) time: 4.1324 data: 0.0002 max mem: 54684 +[01:05:20.047181] Epoch: [0] [1510/3229] lr: 0.000098 grad_norm: 0.6993 (0.8313) closs: 1.0891 (1.1133) time: 4.1156 data: 0.0002 max mem: 54684 +[01:06:01.147748] Epoch: [0] [1520/3229] lr: 0.000098 grad_norm: 0.6925 (0.8304) closs: 1.0764 (1.1131) time: 4.0940 data: 0.0002 max mem: 54684 +[01:06:42.349323] Epoch: [0] [1530/3229] lr: 0.000098 grad_norm: 0.6751 (0.8295) closs: 1.0807 (1.1129) time: 4.1150 data: 0.0002 max mem: 54684 +[01:07:23.760475] Epoch: [0] [1540/3229] lr: 0.000098 grad_norm: 0.6925 (0.8287) closs: 1.0949 (1.1126) time: 4.1306 data: 0.0002 max mem: 54684 +[01:08:03.939766] Epoch: [0] [1550/3229] lr: 0.000098 grad_norm: 0.6828 (0.8275) closs: 1.0497 (1.1118) time: 4.0795 data: 
0.0002 max mem: 54684 +[01:08:44.774063] Epoch: [0] [1560/3229] lr: 0.000098 grad_norm: 0.6374 (0.8264) closs: 1.0350 (1.1114) time: 4.0506 data: 0.0002 max mem: 54684 +[01:09:25.978765] Epoch: [0] [1570/3229] lr: 0.000098 grad_norm: 0.6246 (0.8253) closs: 1.0372 (1.1111) time: 4.1019 data: 0.0002 max mem: 54684 +[01:10:06.789325] Epoch: [0] [1580/3229] lr: 0.000098 grad_norm: 0.6425 (0.8243) closs: 1.0611 (1.1106) time: 4.1007 data: 0.0002 max mem: 54684 +[01:10:47.602967] Epoch: [0] [1590/3229] lr: 0.000097 grad_norm: 0.6555 (0.8233) closs: 1.0611 (1.1102) time: 4.0811 data: 0.0002 max mem: 54684 +[01:11:27.430855] Epoch: [0] [1600/3229] lr: 0.000097 grad_norm: 0.6335 (0.8220) closs: 1.0391 (1.1095) time: 4.0320 data: 0.0002 max mem: 54684 +[01:12:08.472840] Epoch: [0] [1610/3229] lr: 0.000097 grad_norm: 0.6397 (0.8210) closs: 1.0789 (1.1094) time: 4.0434 data: 0.0002 max mem: 54684 +[01:12:49.720690] Epoch: [0] [1620/3229] lr: 0.000097 grad_norm: 0.6731 (0.8203) closs: 1.1052 (1.1095) time: 4.1144 data: 0.0002 max mem: 54684 +[01:13:30.798702] Epoch: [0] [1630/3229] lr: 0.000097 grad_norm: 0.6657 (0.8194) closs: 1.0879 (1.1092) time: 4.1162 data: 0.0002 max mem: 54684 +[01:14:11.875270] Epoch: [0] [1640/3229] lr: 0.000097 grad_norm: 0.6634 (0.8185) closs: 1.0959 (1.1093) time: 4.1077 data: 0.0002 max mem: 54684 +[01:14:52.887302] Epoch: [0] [1650/3229] lr: 0.000097 grad_norm: 0.6733 (0.8176) closs: 1.0812 (1.1090) time: 4.1044 data: 0.0002 max mem: 54684 +[01:15:33.822367] Epoch: [0] [1660/3229] lr: 0.000097 grad_norm: 0.6436 (0.8166) closs: 1.0691 (1.1089) time: 4.0973 data: 0.0002 max mem: 54684 +[01:16:14.261803] Epoch: [0] [1670/3229] lr: 0.000097 grad_norm: 0.6324 (0.8155) closs: 1.0902 (1.1088) time: 4.0687 data: 0.0002 max mem: 54684 +[01:16:55.379849] Epoch: [0] [1680/3229] lr: 0.000097 grad_norm: 0.6285 (0.8143) closs: 1.0896 (1.1088) time: 4.0778 data: 0.0002 max mem: 54684 +[01:17:35.740413] Epoch: [0] [1690/3229] lr: 0.000097 grad_norm: 0.6079 (0.8132) closs: 1.0718 (1.1084) time: 4.0739 data: 0.0002 max mem: 54684 +[01:18:16.750968] Epoch: [0] [1700/3229] lr: 0.000097 grad_norm: 0.6269 (0.8122) closs: 1.0566 (1.1082) time: 4.0685 data: 0.0002 max mem: 54684 +[01:18:56.907298] Epoch: [0] [1710/3229] lr: 0.000097 grad_norm: 0.6192 (0.8112) closs: 1.0566 (1.1077) time: 4.0583 data: 0.0002 max mem: 54684 +[01:19:37.360724] Epoch: [0] [1720/3229] lr: 0.000097 grad_norm: 0.6271 (0.8102) closs: 1.0554 (1.1074) time: 4.0304 data: 0.0002 max mem: 54684 +[01:20:18.619365] Epoch: [0] [1730/3229] lr: 0.000097 grad_norm: 0.6375 (0.8094) closs: 1.0484 (1.1072) time: 4.0855 data: 0.0002 max mem: 54684 +[01:20:59.595694] Epoch: [0] [1740/3229] lr: 0.000097 grad_norm: 0.6375 (0.8083) closs: 1.0680 (1.1071) time: 4.1117 data: 0.0002 max mem: 54684 +[01:21:40.383403] Epoch: [0] [1750/3229] lr: 0.000097 grad_norm: 0.6502 (0.8075) closs: 1.0761 (1.1069) time: 4.0881 data: 0.0002 max mem: 54684 +[01:22:21.505953] Epoch: [0] [1760/3229] lr: 0.000096 grad_norm: 0.6573 (0.8066) closs: 1.1000 (1.1068) time: 4.0954 data: 0.0002 max mem: 54684 +[01:23:02.622161] Epoch: [0] [1770/3229] lr: 0.000096 grad_norm: 0.6351 (0.8057) closs: 1.0783 (1.1064) time: 4.1119 data: 0.0002 max mem: 54684 +[01:23:43.593447] Epoch: [0] [1780/3229] lr: 0.000096 grad_norm: 0.6351 (0.8047) closs: 1.0777 (1.1061) time: 4.1043 data: 0.0002 max mem: 54684 +[01:24:24.461912] Epoch: [0] [1790/3229] lr: 0.000096 grad_norm: 0.6306 (0.8036) closs: 1.0259 (1.1058) time: 4.0919 data: 0.0002 max mem: 54684 +[01:25:05.008923] Epoch: 
[0] [1800/3229] lr: 0.000096 grad_norm: 0.6409 (0.8027) closs: 1.0266 (1.1055) time: 4.0707 data: 0.0002 max mem: 54684 +[01:25:46.132901] Epoch: [0] [1810/3229] lr: 0.000096 grad_norm: 0.6404 (0.8017) closs: 1.0714 (1.1053) time: 4.0835 data: 0.0002 max mem: 54684 +[01:26:26.759139] Epoch: [0] [1820/3229] lr: 0.000096 grad_norm: 0.6292 (0.8007) closs: 1.0945 (1.1052) time: 4.0874 data: 0.0002 max mem: 54684 +[01:27:07.555099] Epoch: [0] [1830/3229] lr: 0.000096 grad_norm: 0.6392 (0.7998) closs: 1.0834 (1.1051) time: 4.0710 data: 0.0002 max mem: 54684 +[01:27:47.364766] Epoch: [0] [1840/3229] lr: 0.000096 grad_norm: 0.5939 (0.7986) closs: 1.0471 (1.1047) time: 4.0302 data: 0.0002 max mem: 54684 +[01:28:28.031123] Epoch: [0] [1850/3229] lr: 0.000096 grad_norm: 0.6241 (0.7977) closs: 1.0452 (1.1044) time: 4.0237 data: 0.0002 max mem: 54684 +[01:29:08.346102] Epoch: [0] [1860/3229] lr: 0.000096 grad_norm: 0.6386 (0.7967) closs: 1.0565 (1.1041) time: 4.0490 data: 0.0002 max mem: 54684 +[01:29:49.454480] Epoch: [0] [1870/3229] lr: 0.000096 grad_norm: 0.6596 (0.7961) closs: 1.0934 (1.1040) time: 4.0711 data: 0.0002 max mem: 54684 +[01:30:30.259488] Epoch: [0] [1880/3229] lr: 0.000096 grad_norm: 0.6550 (0.7952) closs: 1.0679 (1.1036) time: 4.0956 data: 0.0002 max mem: 54684 +[01:31:11.247510] Epoch: [0] [1890/3229] lr: 0.000096 grad_norm: 0.6164 (0.7943) closs: 1.0539 (1.1034) time: 4.0896 data: 0.0002 max mem: 54684 +[01:31:52.539214] Epoch: [0] [1900/3229] lr: 0.000096 grad_norm: 0.6490 (0.7938) closs: 1.0805 (1.1033) time: 4.1139 data: 0.0002 max mem: 54684 +[01:32:33.012926] Epoch: [0] [1910/3229] lr: 0.000095 grad_norm: 0.6397 (0.7929) closs: 1.0446 (1.1028) time: 4.0882 data: 0.0002 max mem: 54684 +[01:33:13.158983] Epoch: [0] [1920/3229] lr: 0.000095 grad_norm: 0.6288 (0.7920) closs: 1.0213 (1.1023) time: 4.0309 data: 0.0002 max mem: 54684 +[01:33:54.586231] Epoch: [0] [1930/3229] lr: 0.000095 grad_norm: 0.6317 (0.7912) closs: 1.0280 (1.1022) time: 4.0786 data: 0.0002 max mem: 54684 +[01:34:35.894563] Epoch: [0] [1940/3229] lr: 0.000095 grad_norm: 0.6358 (0.7905) closs: 1.1057 (1.1023) time: 4.1367 data: 0.0002 max mem: 54684 +[01:35:17.015653] Epoch: [0] [1950/3229] lr: 0.000095 grad_norm: 0.6502 (0.7900) closs: 1.0936 (1.1022) time: 4.1214 data: 0.0002 max mem: 54684 +[01:35:57.478499] Epoch: [0] [1960/3229] lr: 0.000095 grad_norm: 0.6502 (0.7891) closs: 1.0743 (1.1020) time: 4.0791 data: 0.0002 max mem: 54684 +[01:36:38.315306] Epoch: [0] [1970/3229] lr: 0.000095 grad_norm: 0.6454 (0.7885) closs: 1.0743 (1.1019) time: 4.0649 data: 0.0002 max mem: 54684 +[01:37:18.937387] Epoch: [0] [1980/3229] lr: 0.000095 grad_norm: 0.6482 (0.7877) closs: 1.0680 (1.1017) time: 4.0729 data: 0.0002 max mem: 54684 +[01:38:00.087656] Epoch: [0] [1990/3229] lr: 0.000095 grad_norm: 0.6580 (0.7872) closs: 1.0680 (1.1015) time: 4.0885 data: 0.0002 max mem: 54684 +[01:38:40.255048] Epoch: [0] [2000/3229] lr: 0.000095 grad_norm: 0.6547 (0.7863) closs: 1.0624 (1.1011) time: 4.0658 data: 0.0002 max mem: 54684 +[01:39:22.009369] Epoch: [0] [2010/3229] lr: 0.000095 grad_norm: 0.6437 (0.7856) closs: 1.0624 (1.1009) time: 4.0960 data: 0.0002 max mem: 54684 +[01:40:02.651703] Epoch: [0] [2020/3229] lr: 0.000095 grad_norm: 0.6437 (0.7849) closs: 1.0730 (1.1008) time: 4.1198 data: 0.0002 max mem: 54684 +[01:40:42.813224] Epoch: [0] [2030/3229] lr: 0.000095 grad_norm: 0.6359 (0.7840) closs: 1.0540 (1.1003) time: 4.0401 data: 0.0002 max mem: 54684 +[01:41:23.609220] Epoch: [0] [2040/3229] lr: 0.000095 grad_norm: 0.6075 
(0.7831) closs: 1.0542 (1.1001) time: 4.0478 data: 0.0002 max mem: 54684 +[01:42:04.985880] Epoch: [0] [2050/3229] lr: 0.000094 grad_norm: 0.6034 (0.7823) closs: 1.0908 (1.1002) time: 4.1086 data: 0.0002 max mem: 54684 +[01:42:45.962473] Epoch: [0] [2060/3229] lr: 0.000094 grad_norm: 0.6032 (0.7815) closs: 1.1014 (1.1001) time: 4.1176 data: 0.0002 max mem: 54684 +[01:43:26.108426] Epoch: [0] [2070/3229] lr: 0.000094 grad_norm: 0.6324 (0.7809) closs: 1.0512 (1.0997) time: 4.0561 data: 0.0002 max mem: 54684 +[01:44:07.013619] Epoch: [0] [2080/3229] lr: 0.000094 grad_norm: 0.6366 (0.7801) closs: 1.0372 (1.0995) time: 4.0525 data: 0.0002 max mem: 54684 +[01:44:48.024718] Epoch: [0] [2090/3229] lr: 0.000094 grad_norm: 0.6275 (0.7794) closs: 1.0783 (1.0992) time: 4.0957 data: 0.0002 max mem: 54684 +[01:45:28.323605] Epoch: [0] [2100/3229] lr: 0.000094 grad_norm: 0.6155 (0.7786) closs: 1.0706 (1.0991) time: 4.0654 data: 0.0002 max mem: 54684 +[01:46:09.119391] Epoch: [0] [2110/3229] lr: 0.000094 grad_norm: 0.6284 (0.7779) closs: 1.0700 (1.0989) time: 4.0547 data: 0.0002 max mem: 54684 +[01:46:49.684021] Epoch: [0] [2120/3229] lr: 0.000094 grad_norm: 0.6332 (0.7772) closs: 1.0607 (1.0986) time: 4.0680 data: 0.0002 max mem: 54684 +[01:47:30.511414] Epoch: [0] [2130/3229] lr: 0.000094 grad_norm: 0.6332 (0.7764) closs: 1.0698 (1.0986) time: 4.0695 data: 0.0002 max mem: 54684 +[01:48:11.156731] Epoch: [0] [2140/3229] lr: 0.000094 grad_norm: 0.6349 (0.7757) closs: 1.0916 (1.0985) time: 4.0736 data: 0.0002 max mem: 54684 +[01:48:51.920791] Epoch: [0] [2150/3229] lr: 0.000094 grad_norm: 0.6271 (0.7751) closs: 1.0552 (1.0983) time: 4.0704 data: 0.0002 max mem: 54684 +[01:49:32.907279] Epoch: [0] [2160/3229] lr: 0.000094 grad_norm: 0.6144 (0.7744) closs: 1.0773 (1.0983) time: 4.0875 data: 0.0002 max mem: 54684 +[01:50:14.443893] Epoch: [0] [2170/3229] lr: 0.000093 grad_norm: 0.6341 (0.7737) closs: 1.1436 (1.0986) time: 4.1261 data: 0.0002 max mem: 54684 +[01:50:55.743917] Epoch: [0] [2180/3229] lr: 0.000093 grad_norm: 0.6366 (0.7730) closs: 1.0898 (1.0985) time: 4.1418 data: 0.0002 max mem: 54684 +[01:51:36.202814] Epoch: [0] [2190/3229] lr: 0.000093 grad_norm: 0.6366 (0.7725) closs: 1.0797 (1.0984) time: 4.0879 data: 0.0002 max mem: 54684 +[01:52:17.113173] Epoch: [0] [2200/3229] lr: 0.000093 grad_norm: 0.6388 (0.7718) closs: 1.0543 (1.0983) time: 4.0684 data: 0.0002 max mem: 54684 +[01:52:58.036227] Epoch: [0] [2210/3229] lr: 0.000093 grad_norm: 0.6292 (0.7711) closs: 1.0506 (1.0982) time: 4.0916 data: 0.0002 max mem: 54684 +[01:53:38.979227] Epoch: [0] [2220/3229] lr: 0.000093 grad_norm: 0.6303 (0.7705) closs: 1.0667 (1.0979) time: 4.0932 data: 0.0002 max mem: 54684 +[01:54:19.782478] Epoch: [0] [2230/3229] lr: 0.000093 grad_norm: 0.6258 (0.7698) closs: 1.0397 (1.0977) time: 4.0872 data: 0.0002 max mem: 54684 +[01:55:00.467224] Epoch: [0] [2240/3229] lr: 0.000093 grad_norm: 0.6217 (0.7693) closs: 1.0617 (1.0974) time: 4.0743 data: 0.0002 max mem: 54684 +[01:55:41.351470] Epoch: [0] [2250/3229] lr: 0.000093 grad_norm: 0.6183 (0.7686) closs: 1.0617 (1.0973) time: 4.0784 data: 0.0002 max mem: 54684 +[01:56:22.358347] Epoch: [0] [2260/3229] lr: 0.000093 grad_norm: 0.6090 (0.7679) closs: 1.0609 (1.0971) time: 4.0945 data: 0.0002 max mem: 54684 +[01:57:03.481704] Epoch: [0] [2270/3229] lr: 0.000093 grad_norm: 0.6140 (0.7673) closs: 1.0597 (1.0969) time: 4.1064 data: 0.0002 max mem: 54684 +[01:57:44.512674] Epoch: [0] [2280/3229] lr: 0.000093 grad_norm: 0.6077 (0.7666) closs: 1.0421 (1.0968) time: 4.1076 data: 
0.0002 max mem: 54684 +[01:58:25.617467] Epoch: [0] [2290/3229] lr: 0.000092 grad_norm: 0.5894 (0.7659) closs: 1.0421 (1.0967) time: 4.1067 data: 0.0002 max mem: 54684 +[01:59:06.748646] Epoch: [0] [2300/3229] lr: 0.000092 grad_norm: 0.6177 (0.7654) closs: 1.1147 (1.0967) time: 4.1117 data: 0.0002 max mem: 54684 +[01:59:46.929884] Epoch: [0] [2310/3229] lr: 0.000092 grad_norm: 0.6242 (0.7647) closs: 1.0786 (1.0965) time: 4.0656 data: 0.0002 max mem: 54684 +[02:00:27.941038] Epoch: [0] [2320/3229] lr: 0.000092 grad_norm: 0.6335 (0.7641) closs: 1.0572 (1.0963) time: 4.0596 data: 0.0002 max mem: 54684 +[02:01:09.062412] Epoch: [0] [2330/3229] lr: 0.000092 grad_norm: 0.5965 (0.7634) closs: 1.0347 (1.0959) time: 4.1066 data: 0.0002 max mem: 54684 +[02:01:49.850718] Epoch: [0] [2340/3229] lr: 0.000092 grad_norm: 0.5994 (0.7628) closs: 1.0367 (1.0959) time: 4.0954 data: 0.0002 max mem: 54684 +[02:02:30.342915] Epoch: [0] [2350/3229] lr: 0.000092 grad_norm: 0.6362 (0.7621) closs: 1.0736 (1.0957) time: 4.0640 data: 0.0002 max mem: 54684 +[02:03:11.287984] Epoch: [0] [2360/3229] lr: 0.000092 grad_norm: 0.6227 (0.7616) closs: 1.0493 (1.0956) time: 4.0718 data: 0.0002 max mem: 54684 +[02:03:52.343325] Epoch: [0] [2370/3229] lr: 0.000092 grad_norm: 0.6227 (0.7610) closs: 1.0744 (1.0957) time: 4.0999 data: 0.0002 max mem: 54684 +[02:04:33.448316] Epoch: [0] [2380/3229] lr: 0.000092 grad_norm: 0.5984 (0.7603) closs: 1.1016 (1.0957) time: 4.1079 data: 0.0002 max mem: 54684 +[02:05:13.893563] Epoch: [0] [2390/3229] lr: 0.000092 grad_norm: 0.5984 (0.7597) closs: 1.0778 (1.0955) time: 4.0774 data: 0.0002 max mem: 54684 +[02:05:54.988858] Epoch: [0] [2400/3229] lr: 0.000091 grad_norm: 0.6343 (0.7591) closs: 1.0643 (1.0953) time: 4.0770 data: 0.0002 max mem: 54684 +[02:06:36.309038] Epoch: [0] [2410/3229] lr: 0.000091 grad_norm: 0.6401 (0.7586) closs: 1.0573 (1.0952) time: 4.1207 data: 0.0002 max mem: 54684 +[02:07:17.084821] Epoch: [0] [2420/3229] lr: 0.000091 grad_norm: 0.6369 (0.7581) closs: 1.0577 (1.0950) time: 4.1047 data: 0.0002 max mem: 54684 +[02:07:57.540598] Epoch: [0] [2430/3229] lr: 0.000091 grad_norm: 0.6204 (0.7575) closs: 1.0577 (1.0947) time: 4.0615 data: 0.0002 max mem: 54684 +[02:08:39.007315] Epoch: [0] [2440/3229] lr: 0.000091 grad_norm: 0.6199 (0.7569) closs: 1.0658 (1.0947) time: 4.0961 data: 0.0002 max mem: 54684 +[02:09:20.629948] Epoch: [0] [2450/3229] lr: 0.000091 grad_norm: 0.6166 (0.7564) closs: 1.1124 (1.0947) time: 4.1544 data: 0.0002 max mem: 54684 +[02:10:01.711671] Epoch: [0] [2460/3229] lr: 0.000091 grad_norm: 0.6152 (0.7559) closs: 1.0793 (1.0946) time: 4.1352 data: 0.0002 max mem: 54684 +[02:10:42.828111] Epoch: [0] [2470/3229] lr: 0.000091 grad_norm: 0.6222 (0.7554) closs: 1.0420 (1.0943) time: 4.1098 data: 0.0002 max mem: 54684 +[02:11:23.897608] Epoch: [0] [2480/3229] lr: 0.000091 grad_norm: 0.6073 (0.7548) closs: 1.0518 (1.0941) time: 4.1092 data: 0.0002 max mem: 54684 +[02:12:05.274949] Epoch: [0] [2490/3229] lr: 0.000091 grad_norm: 0.6073 (0.7543) closs: 1.0648 (1.0941) time: 4.1223 data: 0.0002 max mem: 54684 +[02:12:46.406795] Epoch: [0] [2500/3229] lr: 0.000090 grad_norm: 0.6085 (0.7537) closs: 1.1052 (1.0941) time: 4.1254 data: 0.0002 max mem: 54684 +[02:13:26.863288] Epoch: [0] [2510/3229] lr: 0.000090 grad_norm: 0.5947 (0.7531) closs: 1.0760 (1.0939) time: 4.0794 data: 0.0002 max mem: 54684 +[02:14:08.186519] Epoch: [0] [2520/3229] lr: 0.000090 grad_norm: 0.5880 (0.7526) closs: 1.0614 (1.0938) time: 4.0889 data: 0.0002 max mem: 54684 +[02:14:49.191285] Epoch: 
[0] [2530/3229] lr: 0.000090 grad_norm: 0.6019 (0.7520) closs: 1.0730 (1.0938) time: 4.1163 data: 0.0002 max mem: 54684 +[02:15:29.975783] Epoch: [0] [2540/3229] lr: 0.000090 grad_norm: 0.5906 (0.7514) closs: 1.0601 (1.0936) time: 4.0894 data: 0.0002 max mem: 54684 +[02:16:10.768922] Epoch: [0] [2550/3229] lr: 0.000090 grad_norm: 0.5906 (0.7508) closs: 1.0423 (1.0934) time: 4.0788 data: 0.0002 max mem: 54684 +[02:16:52.174050] Epoch: [0] [2560/3229] lr: 0.000090 grad_norm: 0.5961 (0.7503) closs: 1.0423 (1.0933) time: 4.1098 data: 0.0002 max mem: 54684 +[02:17:33.116524] Epoch: [0] [2570/3229] lr: 0.000090 grad_norm: 0.5933 (0.7497) closs: 1.0581 (1.0932) time: 4.1173 data: 0.0002 max mem: 54684 +[02:18:14.196101] Epoch: [0] [2580/3229] lr: 0.000090 grad_norm: 0.5860 (0.7491) closs: 1.0619 (1.0931) time: 4.1010 data: 0.0002 max mem: 54684 +[02:18:54.324378] Epoch: [0] [2590/3229] lr: 0.000090 grad_norm: 0.5765 (0.7484) closs: 1.0785 (1.0929) time: 4.0603 data: 0.0002 max mem: 54684 +[02:19:35.831532] Epoch: [0] [2600/3229] lr: 0.000089 grad_norm: 0.5866 (0.7479) closs: 1.0738 (1.0928) time: 4.0817 data: 0.0002 max mem: 54684 +[02:20:16.483023] Epoch: [0] [2610/3229] lr: 0.000089 grad_norm: 0.6122 (0.7473) closs: 1.0686 (1.0925) time: 4.1079 data: 0.0002 max mem: 54684 +[02:20:56.936228] Epoch: [0] [2620/3229] lr: 0.000089 grad_norm: 0.5854 (0.7467) closs: 1.0702 (1.0924) time: 4.0552 data: 0.0002 max mem: 54684 +[02:21:38.121722] Epoch: [0] [2630/3229] lr: 0.000089 grad_norm: 0.5962 (0.7461) closs: 1.0889 (1.0925) time: 4.0819 data: 0.0002 max mem: 54684 +[02:22:19.198229] Epoch: [0] [2640/3229] lr: 0.000089 grad_norm: 0.6168 (0.7457) closs: 1.0931 (1.0924) time: 4.1130 data: 0.0002 max mem: 54684 +[02:22:59.823945] Epoch: [0] [2650/3229] lr: 0.000089 grad_norm: 0.5995 (0.7450) closs: 1.0365 (1.0922) time: 4.0850 data: 0.0002 max mem: 54684 +[02:23:40.933433] Epoch: [0] [2660/3229] lr: 0.000089 grad_norm: 0.5974 (0.7446) closs: 1.0331 (1.0920) time: 4.0867 data: 0.0002 max mem: 54684 +[02:24:21.838608] Epoch: [0] [2670/3229] lr: 0.000089 grad_norm: 0.6172 (0.7441) closs: 1.0431 (1.0918) time: 4.1007 data: 0.0002 max mem: 54684 +[02:25:02.759550] Epoch: [0] [2680/3229] lr: 0.000089 grad_norm: 0.6097 (0.7436) closs: 1.0506 (1.0917) time: 4.0912 data: 0.0002 max mem: 54684 +[02:25:43.386770] Epoch: [0] [2690/3229] lr: 0.000089 grad_norm: 0.5930 (0.7430) closs: 1.0564 (1.0916) time: 4.0773 data: 0.0002 max mem: 54684 +[02:26:23.853609] Epoch: [0] [2700/3229] lr: 0.000088 grad_norm: 0.5717 (0.7423) closs: 1.0674 (1.0915) time: 4.0546 data: 0.0002 max mem: 54684 +[02:27:04.832163] Epoch: [0] [2710/3229] lr: 0.000088 grad_norm: 0.5725 (0.7417) closs: 1.0830 (1.0914) time: 4.0722 data: 0.0002 max mem: 54684 +[02:27:46.111218] Epoch: [0] [2720/3229] lr: 0.000088 grad_norm: 0.5860 (0.7414) closs: 1.0830 (1.0914) time: 4.1128 data: 0.0002 max mem: 54684 +[02:28:27.035746] Epoch: [0] [2730/3229] lr: 0.000088 grad_norm: 0.5996 (0.7410) closs: 1.1146 (1.0913) time: 4.1101 data: 0.0002 max mem: 54684 +[02:29:07.490028] Epoch: [0] [2740/3229] lr: 0.000088 grad_norm: 0.6072 (0.7405) closs: 1.0595 (1.0911) time: 4.0689 data: 0.0002 max mem: 54684 +[02:29:48.509847] Epoch: [0] [2750/3229] lr: 0.000088 grad_norm: 0.6080 (0.7400) closs: 1.0567 (1.0910) time: 4.0736 data: 0.0002 max mem: 54684 +[02:30:29.784566] Epoch: [0] [2760/3229] lr: 0.000088 grad_norm: 0.5941 (0.7394) closs: 1.0694 (1.0909) time: 4.1147 data: 0.0002 max mem: 54684 +[02:31:10.397341] Epoch: [0] [2770/3229] lr: 0.000088 grad_norm: 0.5914 
(0.7388) closs: 1.0694 (1.0907) time: 4.0943 data: 0.0002 max mem: 54684 +[02:31:50.521320] Epoch: [0] [2780/3229] lr: 0.000088 grad_norm: 0.5933 (0.7382) closs: 1.0263 (1.0903) time: 4.0368 data: 0.0002 max mem: 54684 +[02:32:31.199358] Epoch: [0] [2790/3229] lr: 0.000087 grad_norm: 0.5933 (0.7377) closs: 1.0229 (1.0902) time: 4.0400 data: 0.0002 max mem: 54684 +[02:33:11.823438] Epoch: [0] [2800/3229] lr: 0.000087 grad_norm: 0.5871 (0.7372) closs: 1.0686 (1.0900) time: 4.0650 data: 0.0002 max mem: 54684 +[02:33:52.788200] Epoch: [0] [2810/3229] lr: 0.000087 grad_norm: 0.5766 (0.7369) closs: 1.0498 (1.0899) time: 4.0794 data: 0.0002 max mem: 54684 +[02:34:33.570995] Epoch: [0] [2820/3229] lr: 0.000087 grad_norm: 0.5867 (0.7364) closs: 1.0498 (1.0897) time: 4.0873 data: 0.0002 max mem: 54684 +[02:35:14.475184] Epoch: [0] [2830/3229] lr: 0.000087 grad_norm: 0.5867 (0.7360) closs: 1.0506 (1.0897) time: 4.0843 data: 0.0002 max mem: 54684 +[02:35:55.072862] Epoch: [0] [2840/3229] lr: 0.000087 grad_norm: 0.5796 (0.7354) closs: 1.0448 (1.0896) time: 4.0750 data: 0.0002 max mem: 54684 +[02:36:36.343452] Epoch: [0] [2850/3229] lr: 0.000087 grad_norm: 0.5795 (0.7349) closs: 1.0341 (1.0895) time: 4.0934 data: 0.0002 max mem: 54684 +[02:37:16.453657] Epoch: [0] [2860/3229] lr: 0.000087 grad_norm: 0.5890 (0.7343) closs: 1.0884 (1.0894) time: 4.0690 data: 0.0002 max mem: 54684 +[02:37:57.847510] Epoch: [0] [2870/3229] lr: 0.000087 grad_norm: 0.6032 (0.7338) closs: 1.0884 (1.0893) time: 4.0751 data: 0.0002 max mem: 54684 +[02:38:38.666224] Epoch: [0] [2880/3229] lr: 0.000086 grad_norm: 0.6020 (0.7333) closs: 1.1037 (1.0894) time: 4.1106 data: 0.0002 max mem: 54684 +[02:39:19.313654] Epoch: [0] [2890/3229] lr: 0.000086 grad_norm: 0.5895 (0.7328) closs: 1.0991 (1.0892) time: 4.0732 data: 0.0002 max mem: 54684 +[02:40:00.426834] Epoch: [0] [2900/3229] lr: 0.000086 grad_norm: 0.5895 (0.7323) closs: 1.0807 (1.0892) time: 4.0880 data: 0.0002 max mem: 54684 +[02:40:41.449641] Epoch: [0] [2910/3229] lr: 0.000086 grad_norm: 0.6071 (0.7318) closs: 1.0677 (1.0890) time: 4.1067 data: 0.0002 max mem: 54684 +[02:41:22.717088] Epoch: [0] [2920/3229] lr: 0.000086 grad_norm: 0.5769 (0.7313) closs: 1.0795 (1.0890) time: 4.1145 data: 0.0002 max mem: 54684 +[02:42:03.989019] Epoch: [0] [2930/3229] lr: 0.000086 grad_norm: 0.6046 (0.7310) closs: 1.0829 (1.0889) time: 4.1269 data: 0.0002 max mem: 54684 +[02:42:44.765771] Epoch: [0] [2940/3229] lr: 0.000086 grad_norm: 0.6112 (0.7306) closs: 1.0782 (1.0890) time: 4.1024 data: 0.0002 max mem: 54684 +[02:43:26.121816] Epoch: [0] [2950/3229] lr: 0.000086 grad_norm: 0.5910 (0.7301) closs: 1.0738 (1.0889) time: 4.1066 data: 0.0002 max mem: 54684 +[02:44:06.980865] Epoch: [0] [2960/3229] lr: 0.000085 grad_norm: 0.5530 (0.7294) closs: 1.0462 (1.0887) time: 4.1107 data: 0.0002 max mem: 54684 +[02:44:47.929066] Epoch: [0] [2970/3229] lr: 0.000085 grad_norm: 0.5539 (0.7290) closs: 1.0193 (1.0885) time: 4.0903 data: 0.0002 max mem: 54684 +[02:45:29.038596] Epoch: [0] [2980/3229] lr: 0.000085 grad_norm: 0.5897 (0.7286) closs: 1.0596 (1.0885) time: 4.1028 data: 0.0002 max mem: 54684 +[02:46:10.337131] Epoch: [0] [2990/3229] lr: 0.000085 grad_norm: 0.6092 (0.7282) closs: 1.0325 (1.0882) time: 4.1203 data: 0.0002 max mem: 54684 +[02:46:51.160184] Epoch: [0] [3000/3229] lr: 0.000085 grad_norm: 0.6092 (0.7278) closs: 1.0017 (1.0880) time: 4.1060 data: 0.0002 max mem: 54684 +[02:47:32.092042] Epoch: [0] [3010/3229] lr: 0.000085 grad_norm: 0.5830 (0.7272) closs: 1.0432 (1.0880) time: 4.0877 data: 
0.0002 max mem: 54684 +[02:48:13.191664] Epoch: [0] [3020/3229] lr: 0.000085 grad_norm: 0.5738 (0.7268) closs: 1.0643 (1.0879) time: 4.1015 data: 0.0002 max mem: 54684 +[02:48:53.607366] Epoch: [0] [3030/3229] lr: 0.000085 grad_norm: 0.5865 (0.7262) closs: 1.0520 (1.0876) time: 4.0757 data: 0.0002 max mem: 54684 +[02:49:34.770223] Epoch: [0] [3040/3229] lr: 0.000084 grad_norm: 0.5963 (0.7258) closs: 1.0520 (1.0875) time: 4.0789 data: 0.0002 max mem: 54684 +[02:50:15.718625] Epoch: [0] [3050/3229] lr: 0.000084 grad_norm: 0.5844 (0.7253) closs: 1.0391 (1.0874) time: 4.1055 data: 0.0002 max mem: 54684 +[02:50:56.816806] Epoch: [0] [3060/3229] lr: 0.000084 grad_norm: 0.5844 (0.7249) closs: 1.0391 (1.0873) time: 4.1023 data: 0.0002 max mem: 54684 +[02:51:38.020430] Epoch: [0] [3070/3229] lr: 0.000084 grad_norm: 0.5790 (0.7244) closs: 1.0698 (1.0873) time: 4.1150 data: 0.0002 max mem: 54684 +[02:52:18.623922] Epoch: [0] [3080/3229] lr: 0.000084 grad_norm: 0.5625 (0.7238) closs: 1.0568 (1.0871) time: 4.0903 data: 0.0002 max mem: 54684 +[02:52:59.070119] Epoch: [0] [3090/3229] lr: 0.000084 grad_norm: 0.5895 (0.7234) closs: 1.0600 (1.0870) time: 4.0524 data: 0.0002 max mem: 54684 +[02:53:39.601820] Epoch: [0] [3100/3229] lr: 0.000084 grad_norm: 0.5724 (0.7228) closs: 1.1135 (1.0870) time: 4.0488 data: 0.0002 max mem: 54684 +[02:54:20.332654] Epoch: [0] [3110/3229] lr: 0.000084 grad_norm: 0.5543 (0.7223) closs: 1.0445 (1.0869) time: 4.0631 data: 0.0002 max mem: 54684 +[02:55:01.284722] Epoch: [0] [3120/3229] lr: 0.000084 grad_norm: 0.5879 (0.7219) closs: 1.0445 (1.0867) time: 4.0841 data: 0.0002 max mem: 54684 +[02:55:41.419647] Epoch: [0] [3130/3229] lr: 0.000083 grad_norm: 0.5835 (0.7213) closs: 1.0551 (1.0865) time: 4.0543 data: 0.0002 max mem: 54684 +[02:56:21.204464] Epoch: [0] [3140/3229] lr: 0.000083 grad_norm: 0.5494 (0.7208) closs: 1.0305 (1.0863) time: 3.9959 data: 0.0002 max mem: 54684 +[02:57:02.592698] Epoch: [0] [3150/3229] lr: 0.000083 grad_norm: 0.5644 (0.7203) closs: 1.0305 (1.0862) time: 4.0586 data: 0.0002 max mem: 54684 +[02:57:42.889503] Epoch: [0] [3160/3229] lr: 0.000083 grad_norm: 0.5874 (0.7199) closs: 1.0279 (1.0860) time: 4.0842 data: 0.0002 max mem: 54684 +[02:58:23.673238] Epoch: [0] [3170/3229] lr: 0.000083 grad_norm: 0.5838 (0.7196) closs: 1.0648 (1.0859) time: 4.0540 data: 0.0002 max mem: 54684 +[02:59:04.635512] Epoch: [0] [3180/3229] lr: 0.000083 grad_norm: 0.6047 (0.7193) closs: 1.0742 (1.0858) time: 4.0872 data: 0.0002 max mem: 54684 +[02:59:45.556337] Epoch: [0] [3190/3229] lr: 0.000083 grad_norm: 0.5944 (0.7188) closs: 1.0606 (1.0857) time: 4.0941 data: 0.0002 max mem: 54684 +[03:00:26.818291] Epoch: [0] [3200/3229] lr: 0.000082 grad_norm: 0.5647 (0.7183) closs: 1.0673 (1.0856) time: 4.1091 data: 0.0003 max mem: 54684 +[03:01:07.594946] Epoch: [0] [3210/3229] lr: 0.000082 grad_norm: 0.5647 (0.7179) closs: 1.0773 (1.0855) time: 4.1019 data: 0.0003 max mem: 54684 +[03:01:48.926992] Epoch: [0] [3220/3229] lr: 0.000082 grad_norm: 0.5837 (0.7176) closs: 1.0710 (1.0855) time: 4.1054 data: 0.0001 max mem: 54684 +[03:02:22.158116] Epoch: [0] Total time: 3:39:54 +[03:02:22.159151] Averaged stats: lr: 0.000082 grad_norm: 0.5870 (0.7172) closs: 1.0331 (1.0847) +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
+ warnings.warn(
+[the same UserWarning and `warnings.warn(` line were emitted verbatim by six more of the eight ranks; duplicates omitted]
+[03:02:22.501825] model saved
+/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
+ warnings.warn( +[03:02:24.206463] optimizer saved +[03:02:24.207016] other rank-common saved +[03:02:24.212083] rank-specific saved +[03:02:24.225861] log_dir: ./output_dir +[03:02:37.059687] Epoch: [1] [0/3229] lr: 0.000082 grad_norm: 0.6132 (0.6132) closs: 0.9999 (0.9999) time: 12.8331 data: 8.7530 max mem: 54684 +[03:03:18.632210] Epoch: [1] [10/3229] lr: 0.000082 grad_norm: 0.5647 (0.5912) closs: 1.0765 (1.0837) time: 4.9459 data: 0.7958 max mem: 54684 +[03:03:59.051367] Epoch: [1] [20/3229] lr: 0.000082 grad_norm: 0.5647 (0.5861) closs: 1.0516 (1.0566) time: 4.0995 data: 0.0001 max mem: 54684 +[03:04:40.503119] Epoch: [1] [30/3229] lr: 0.000082 grad_norm: 0.5915 (0.5939) closs: 1.0516 (1.0691) time: 4.0935 data: 0.0002 max mem: 54684 +[03:05:22.097537] Epoch: [1] [40/3229] lr: 0.000082 grad_norm: 0.5850 (0.5910) closs: 1.0820 (1.0720) time: 4.1522 data: 0.0002 max mem: 54684 +[03:06:03.090896] Epoch: [1] [50/3229] lr: 0.000081 grad_norm: 0.5815 (0.5900) closs: 1.0677 (1.0644) time: 4.1293 data: 0.0002 max mem: 54684 +[03:06:43.196597] Epoch: [1] [60/3229] lr: 0.000081 grad_norm: 0.5899 (0.5894) closs: 1.0078 (1.0537) time: 4.0549 data: 0.0002 max mem: 54684 +[03:07:24.613357] Epoch: [1] [70/3229] lr: 0.000081 grad_norm: 0.6045 (0.5897) closs: 1.0219 (1.0506) time: 4.0761 data: 0.0002 max mem: 54684 +[03:08:04.515157] Epoch: [1] [80/3229] lr: 0.000081 grad_norm: 0.5799 (0.5831) closs: 1.0161 (1.0428) time: 4.0659 data: 0.0002 max mem: 54684 +[03:08:45.467975] Epoch: [1] [90/3229] lr: 0.000081 grad_norm: 0.5556 (0.5835) closs: 1.0144 (1.0444) time: 4.0427 data: 0.0002 max mem: 54684 +[03:09:26.243171] Epoch: [1] [100/3229] lr: 0.000081 grad_norm: 0.5795 (0.5835) closs: 1.0435 (1.0434) time: 4.0863 data: 0.0002 max mem: 54684 +[03:10:08.038819] Epoch: [1] [110/3229] lr: 0.000081 grad_norm: 0.5723 (0.5835) closs: 1.0376 (1.0418) time: 4.1285 data: 0.0002 max mem: 54684 +[03:10:48.939404] Epoch: [1] [120/3229] lr: 0.000081 grad_norm: 0.5722 (0.5819) closs: 1.0377 (1.0430) time: 4.1347 data: 0.0002 max mem: 54684 +[03:11:30.055410] Epoch: [1] [130/3229] lr: 0.000080 grad_norm: 0.5803 (0.5823) closs: 1.0633 (1.0459) time: 4.1008 data: 0.0002 max mem: 54684 +[03:12:10.774513] Epoch: [1] [140/3229] lr: 0.000080 grad_norm: 0.5809 (0.5827) closs: 1.0694 (1.0474) time: 4.0917 data: 0.0002 max mem: 54684 +[03:12:51.481838] Epoch: [1] [150/3229] lr: 0.000080 grad_norm: 0.5672 (0.5807) closs: 1.0533 (1.0461) time: 4.0713 data: 0.0002 max mem: 54684 +[03:13:32.160702] Epoch: [1] [160/3229] lr: 0.000080 grad_norm: 0.5565 (0.5796) closs: 1.0420 (1.0467) time: 4.0692 data: 0.0001 max mem: 54684 +[03:14:12.777131] Epoch: [1] [170/3229] lr: 0.000080 grad_norm: 0.5744 (0.5800) closs: 1.0506 (1.0438) time: 4.0647 data: 0.0001 max mem: 54684 +[03:14:53.223464] Epoch: [1] [180/3229] lr: 0.000080 grad_norm: 0.6011 (0.5837) closs: 1.0397 (1.0437) time: 4.0531 data: 0.0002 max mem: 54684 +[03:15:34.236634] Epoch: [1] [190/3229] lr: 0.000080 grad_norm: 0.5986 (0.5832) closs: 1.0226 (1.0418) time: 4.0729 data: 0.0002 max mem: 54684 +[03:16:15.503140] Epoch: [1] [200/3229] lr: 0.000079 grad_norm: 0.5903 (0.5841) closs: 1.0284 (1.0425) time: 4.1139 data: 0.0002 max mem: 54684 +[03:16:56.870515] Epoch: [1] [210/3229] lr: 0.000079 grad_norm: 0.5972 (0.5856) closs: 1.0789 (1.0449) time: 4.1316 data: 0.0002 max mem: 54684 +[03:17:37.377683] Epoch: [1] [220/3229] lr: 0.000079 grad_norm: 0.6051 (0.5856) closs: 1.0734 (1.0451) time: 4.0937 data: 0.0002 max mem: 54684 +[03:18:17.897913] Epoch: [1] [230/3229] lr: 0.000079 
grad_norm: 0.5783 (0.5854) closs: 1.0634 (1.0431) time: 4.0513 data: 0.0002 max mem: 54684 +[03:18:58.845292] Epoch: [1] [240/3229] lr: 0.000079 grad_norm: 0.5760 (0.5856) closs: 1.0062 (1.0415) time: 4.0733 data: 0.0002 max mem: 54684 +[03:19:40.143287] Epoch: [1] [250/3229] lr: 0.000079 grad_norm: 0.5830 (0.5858) closs: 1.0155 (1.0413) time: 4.1122 data: 0.0002 max mem: 54684 +[03:20:21.253868] Epoch: [1] [260/3229] lr: 0.000079 grad_norm: 0.5724 (0.5857) closs: 1.0472 (1.0420) time: 4.1204 data: 0.0002 max mem: 54684 +[03:21:02.310900] Epoch: [1] [270/3229] lr: 0.000079 grad_norm: 0.5693 (0.5852) closs: 1.0750 (1.0425) time: 4.1083 data: 0.0002 max mem: 54684 +[03:21:42.925981] Epoch: [1] [280/3229] lr: 0.000078 grad_norm: 0.5739 (0.5850) closs: 1.0208 (1.0422) time: 4.0835 data: 0.0002 max mem: 54684 +[03:22:23.870461] Epoch: [1] [290/3229] lr: 0.000078 grad_norm: 0.5811 (0.5850) closs: 1.0208 (1.0421) time: 4.0779 data: 0.0002 max mem: 54684 +[03:23:03.649607] Epoch: [1] [300/3229] lr: 0.000078 grad_norm: 0.5807 (0.5838) closs: 1.0100 (1.0409) time: 4.0361 data: 0.0002 max mem: 54684 +[03:23:44.397242] Epoch: [1] [310/3229] lr: 0.000078 grad_norm: 0.5614 (0.5834) closs: 0.9924 (1.0399) time: 4.0263 data: 0.0002 max mem: 54684 +[03:24:25.692114] Epoch: [1] [320/3229] lr: 0.000078 grad_norm: 0.5705 (0.5838) closs: 1.0024 (1.0397) time: 4.1021 data: 0.0002 max mem: 54684 +[03:25:06.316947] Epoch: [1] [330/3229] lr: 0.000078 grad_norm: 0.5706 (0.5832) closs: 1.0348 (1.0391) time: 4.0959 data: 0.0002 max mem: 54684 +[03:25:47.408916] Epoch: [1] [340/3229] lr: 0.000078 grad_norm: 0.5612 (0.5823) closs: 1.0617 (1.0411) time: 4.0858 data: 0.0002 max mem: 54684 +[03:26:27.973524] Epoch: [1] [350/3229] lr: 0.000077 grad_norm: 0.5532 (0.5812) closs: 1.0858 (1.0398) time: 4.0828 data: 0.0002 max mem: 54684 +[03:27:08.836154] Epoch: [1] [360/3229] lr: 0.000077 grad_norm: 0.5532 (0.5813) closs: 1.0425 (1.0395) time: 4.0713 data: 0.0002 max mem: 54684 +[03:27:50.188867] Epoch: [1] [370/3229] lr: 0.000077 grad_norm: 0.5685 (0.5813) closs: 1.0543 (1.0401) time: 4.1107 data: 0.0002 max mem: 54684 +[03:28:31.322423] Epoch: [1] [380/3229] lr: 0.000077 grad_norm: 0.5652 (0.5814) closs: 1.0788 (1.0413) time: 4.1243 data: 0.0002 max mem: 54684 +[03:29:11.945890] Epoch: [1] [390/3229] lr: 0.000077 grad_norm: 0.5715 (0.5806) closs: 1.0404 (1.0400) time: 4.0878 data: 0.0002 max mem: 54684 +[03:29:52.453441] Epoch: [1] [400/3229] lr: 0.000077 grad_norm: 0.5882 (0.5809) closs: 0.9902 (1.0388) time: 4.0565 data: 0.0002 max mem: 54684 +[03:30:33.398665] Epoch: [1] [410/3229] lr: 0.000077 grad_norm: 0.5950 (0.5812) closs: 1.0371 (1.0393) time: 4.0726 data: 0.0002 max mem: 54684 +[03:31:13.860269] Epoch: [1] [420/3229] lr: 0.000076 grad_norm: 0.5764 (0.5808) closs: 1.0707 (1.0392) time: 4.0703 data: 0.0002 max mem: 54684 +[03:31:55.414571] Epoch: [1] [430/3229] lr: 0.000076 grad_norm: 0.5494 (0.5805) closs: 1.0707 (1.0403) time: 4.1007 data: 0.0002 max mem: 54684 +[03:32:35.855299] Epoch: [1] [440/3229] lr: 0.000076 grad_norm: 0.5548 (0.5802) closs: 1.0658 (1.0402) time: 4.0997 data: 0.0002 max mem: 54684 +[03:33:16.452210] Epoch: [1] [450/3229] lr: 0.000076 grad_norm: 0.5614 (0.5798) closs: 1.0492 (1.0401) time: 4.0518 data: 0.0004 max mem: 54684 +[03:33:57.234098] Epoch: [1] [460/3229] lr: 0.000076 grad_norm: 0.5710 (0.5794) closs: 1.0546 (1.0406) time: 4.0689 data: 0.0004 max mem: 54684 +[03:34:38.211753] Epoch: [1] [470/3229] lr: 0.000076 grad_norm: 0.5683 (0.5792) closs: 1.0546 (1.0399) time: 4.0879 data: 
0.0002 max mem: 54684 +[03:35:19.148555] Epoch: [1] [480/3229] lr: 0.000076 grad_norm: 0.5668 (0.5788) closs: 1.0823 (1.0410) time: 4.0957 data: 0.0002 max mem: 54684 +[03:35:59.926822] Epoch: [1] [490/3229] lr: 0.000075 grad_norm: 0.5872 (0.5795) closs: 1.0733 (1.0407) time: 4.0857 data: 0.0002 max mem: 54684 +[03:36:40.816831] Epoch: [1] [500/3229] lr: 0.000075 grad_norm: 0.5882 (0.5796) closs: 1.0178 (1.0404) time: 4.0833 data: 0.0002 max mem: 54684 +[03:37:21.882692] Epoch: [1] [510/3229] lr: 0.000075 grad_norm: 0.5975 (0.5799) closs: 1.0400 (1.0405) time: 4.0977 data: 0.0002 max mem: 54684 +[03:38:02.986718] Epoch: [1] [520/3229] lr: 0.000075 grad_norm: 0.5864 (0.5798) closs: 1.0698 (1.0415) time: 4.1084 data: 0.0002 max mem: 54684 +[03:38:44.292702] Epoch: [1] [530/3229] lr: 0.000075 grad_norm: 0.5751 (0.5799) closs: 1.0699 (1.0420) time: 4.1204 data: 0.0002 max mem: 54684 +[03:39:25.188063] Epoch: [1] [540/3229] lr: 0.000075 grad_norm: 0.5619 (0.5795) closs: 1.0750 (1.0422) time: 4.1100 data: 0.0002 max mem: 54684 +[03:40:06.326097] Epoch: [1] [550/3229] lr: 0.000075 grad_norm: 0.5697 (0.5793) closs: 1.0631 (1.0421) time: 4.1016 data: 0.0002 max mem: 54684 +[03:40:46.941861] Epoch: [1] [560/3229] lr: 0.000074 grad_norm: 0.5769 (0.5789) closs: 1.0369 (1.0421) time: 4.0876 data: 0.0002 max mem: 54684 +[03:41:27.715194] Epoch: [1] [570/3229] lr: 0.000074 grad_norm: 0.5715 (0.5790) closs: 1.0353 (1.0420) time: 4.0694 data: 0.0002 max mem: 54684 +[03:42:08.311161] Epoch: [1] [580/3229] lr: 0.000074 grad_norm: 0.5662 (0.5787) closs: 1.0276 (1.0416) time: 4.0684 data: 0.0002 max mem: 54684 +[03:42:49.459226] Epoch: [1] [590/3229] lr: 0.000074 grad_norm: 0.5645 (0.5789) closs: 1.0383 (1.0416) time: 4.0871 data: 0.0002 max mem: 54684 +[03:43:29.764469] Epoch: [1] [600/3229] lr: 0.000074 grad_norm: 0.5632 (0.5782) closs: 1.0646 (1.0416) time: 4.0726 data: 0.0002 max mem: 54684 +[03:44:10.863831] Epoch: [1] [610/3229] lr: 0.000074 grad_norm: 0.5650 (0.5784) closs: 1.0527 (1.0416) time: 4.0702 data: 0.0002 max mem: 54684 +[03:44:52.100000] Epoch: [1] [620/3229] lr: 0.000073 grad_norm: 0.5675 (0.5784) closs: 1.0475 (1.0417) time: 4.1167 data: 0.0002 max mem: 54684 +[03:45:32.780967] Epoch: [1] [630/3229] lr: 0.000073 grad_norm: 0.5615 (0.5781) closs: 1.0306 (1.0406) time: 4.0958 data: 0.0002 max mem: 54684 +[03:46:13.717184] Epoch: [1] [640/3229] lr: 0.000073 grad_norm: 0.5615 (0.5779) closs: 1.0113 (1.0406) time: 4.0808 data: 0.0002 max mem: 54684 +[03:46:54.518077] Epoch: [1] [650/3229] lr: 0.000073 grad_norm: 0.5599 (0.5776) closs: 1.0253 (1.0406) time: 4.0868 data: 0.0002 max mem: 54684 +[03:47:35.653148] Epoch: [1] [660/3229] lr: 0.000073 grad_norm: 0.5541 (0.5776) closs: 1.0473 (1.0408) time: 4.0967 data: 0.0002 max mem: 54684 +[03:48:16.527626] Epoch: [1] [670/3229] lr: 0.000073 grad_norm: 0.5589 (0.5774) closs: 1.0642 (1.0410) time: 4.1004 data: 0.0002 max mem: 54684 +[03:48:57.838597] Epoch: [1] [680/3229] lr: 0.000073 grad_norm: 0.5645 (0.5773) closs: 1.0592 (1.0413) time: 4.1092 data: 0.0002 max mem: 54684 +[03:49:38.297230] Epoch: [1] [690/3229] lr: 0.000072 grad_norm: 0.5645 (0.5770) closs: 1.0436 (1.0416) time: 4.0884 data: 0.0002 max mem: 54684 +[03:50:19.056069] Epoch: [1] [700/3229] lr: 0.000072 grad_norm: 0.5649 (0.5768) closs: 1.0510 (1.0413) time: 4.0608 data: 0.0002 max mem: 54684 +[03:50:59.983545] Epoch: [1] [710/3229] lr: 0.000072 grad_norm: 0.5638 (0.5763) closs: 1.0680 (1.0411) time: 4.0843 data: 0.0002 max mem: 54684 +[03:51:41.244969] Epoch: [1] [720/3229] lr: 
0.000072 grad_norm: 0.5580 (0.5763) closs: 1.0682 (1.0419) time: 4.1094 data: 0.0002 max mem: 54684 +[03:52:22.017980] Epoch: [1] [730/3229] lr: 0.000072 grad_norm: 0.5673 (0.5762) closs: 1.0718 (1.0422) time: 4.1017 data: 0.0002 max mem: 54684 +[03:53:02.115581] Epoch: [1] [740/3229] lr: 0.000072 grad_norm: 0.5340 (0.5757) closs: 1.0090 (1.0416) time: 4.0435 data: 0.0002 max mem: 54684 +[03:53:43.063539] Epoch: [1] [750/3229] lr: 0.000072 grad_norm: 0.5403 (0.5754) closs: 1.0249 (1.0419) time: 4.0522 data: 0.0002 max mem: 54684 +[03:54:24.299312] Epoch: [1] [760/3229] lr: 0.000071 grad_norm: 0.5616 (0.5762) closs: 1.0686 (1.0422) time: 4.1091 data: 0.0002 max mem: 54684 +[03:55:05.072533] Epoch: [1] [770/3229] lr: 0.000071 grad_norm: 0.6088 (0.5766) closs: 1.0797 (1.0428) time: 4.1004 data: 0.0002 max mem: 54684 +[03:55:46.171867] Epoch: [1] [780/3229] lr: 0.000071 grad_norm: 0.5813 (0.5764) closs: 1.0868 (1.0434) time: 4.0936 data: 0.0002 max mem: 54684 +[03:56:27.138912] Epoch: [1] [790/3229] lr: 0.000071 grad_norm: 0.5429 (0.5760) closs: 1.0689 (1.0430) time: 4.1033 data: 0.0002 max mem: 54684 +[03:57:08.406933] Epoch: [1] [800/3229] lr: 0.000071 grad_norm: 0.5489 (0.5760) closs: 1.0240 (1.0432) time: 4.1117 data: 0.0002 max mem: 54684 +[03:57:48.557636] Epoch: [1] [810/3229] lr: 0.000071 grad_norm: 0.5489 (0.5756) closs: 1.0244 (1.0427) time: 4.0709 data: 0.0002 max mem: 54684 +[03:58:29.984938] Epoch: [1] [820/3229] lr: 0.000070 grad_norm: 0.5609 (0.5756) closs: 1.0117 (1.0426) time: 4.0788 data: 0.0002 max mem: 54684 +[03:59:10.607826] Epoch: [1] [830/3229] lr: 0.000070 grad_norm: 0.5660 (0.5753) closs: 1.0388 (1.0428) time: 4.1024 data: 0.0002 max mem: 54684 +[03:59:51.579481] Epoch: [1] [840/3229] lr: 0.000070 grad_norm: 0.5696 (0.5752) closs: 1.0748 (1.0434) time: 4.0797 data: 0.0002 max mem: 54684 +[04:00:32.354623] Epoch: [1] [850/3229] lr: 0.000070 grad_norm: 0.5696 (0.5751) closs: 1.0576 (1.0436) time: 4.0873 data: 0.0002 max mem: 54684 +[04:01:13.476385] Epoch: [1] [860/3229] lr: 0.000070 grad_norm: 0.5586 (0.5748) closs: 1.0391 (1.0437) time: 4.0948 data: 0.0002 max mem: 54684 +[04:01:54.062613] Epoch: [1] [870/3229] lr: 0.000070 grad_norm: 0.5624 (0.5746) closs: 1.0365 (1.0434) time: 4.0853 data: 0.0002 max mem: 54684 +[04:02:34.708901] Epoch: [1] [880/3229] lr: 0.000070 grad_norm: 0.5675 (0.5744) closs: 1.0142 (1.0432) time: 4.0616 data: 0.0002 max mem: 54684 +[04:03:15.525860] Epoch: [1] [890/3229] lr: 0.000069 grad_norm: 0.5504 (0.5742) closs: 1.0676 (1.0438) time: 4.0731 data: 0.0002 max mem: 54684 +[04:03:56.736301] Epoch: [1] [900/3229] lr: 0.000069 grad_norm: 0.5477 (0.5740) closs: 1.0676 (1.0439) time: 4.1013 data: 0.0002 max mem: 54684 +[04:04:36.920488] Epoch: [1] [910/3229] lr: 0.000069 grad_norm: 0.5477 (0.5738) closs: 1.0670 (1.0441) time: 4.0697 data: 0.0002 max mem: 54684 +[04:05:18.168891] Epoch: [1] [920/3229] lr: 0.000069 grad_norm: 0.5736 (0.5739) closs: 1.0477 (1.0440) time: 4.0716 data: 0.0002 max mem: 54684 +[04:05:59.267874] Epoch: [1] [930/3229] lr: 0.000069 grad_norm: 0.5780 (0.5739) closs: 1.0477 (1.0441) time: 4.1173 data: 0.0002 max mem: 54684 +[04:06:40.055537] Epoch: [1] [940/3229] lr: 0.000069 grad_norm: 0.5780 (0.5739) closs: 1.0047 (1.0436) time: 4.0943 data: 0.0002 max mem: 54684 +[04:07:21.013845] Epoch: [1] [950/3229] lr: 0.000068 grad_norm: 0.5719 (0.5737) closs: 0.9697 (1.0432) time: 4.0872 data: 0.0002 max mem: 54684 +[04:08:01.949372] Epoch: [1] [960/3229] lr: 0.000068 grad_norm: 0.5583 (0.5738) closs: 0.9883 (1.0433) time: 4.0946 
data: 0.0002 max mem: 54684 +[04:08:43.032157] Epoch: [1] [970/3229] lr: 0.000068 grad_norm: 0.5604 (0.5737) closs: 1.0494 (1.0434) time: 4.1009 data: 0.0002 max mem: 54684 +[04:09:24.176825] Epoch: [1] [980/3229] lr: 0.000068 grad_norm: 0.5677 (0.5737) closs: 1.0494 (1.0436) time: 4.1113 data: 0.0002 max mem: 54684 +[04:10:05.360266] Epoch: [1] [990/3229] lr: 0.000068 grad_norm: 0.5701 (0.5737) closs: 1.0626 (1.0438) time: 4.1163 data: 0.0002 max mem: 54684 +[04:10:45.379904] Epoch: [1] [1000/3229] lr: 0.000068 grad_norm: 0.5526 (0.5731) closs: 1.0626 (1.0436) time: 4.0601 data: 0.0002 max mem: 54684 +[04:11:26.290114] Epoch: [1] [1010/3229] lr: 0.000068 grad_norm: 0.5378 (0.5730) closs: 1.0114 (1.0433) time: 4.0464 data: 0.0002 max mem: 54684 +[04:12:06.766731] Epoch: [1] [1020/3229] lr: 0.000067 grad_norm: 0.5378 (0.5726) closs: 1.0514 (1.0431) time: 4.0693 data: 0.0002 max mem: 54684 +[04:12:47.630579] Epoch: [1] [1030/3229] lr: 0.000067 grad_norm: 0.5303 (0.5722) closs: 1.0395 (1.0431) time: 4.0670 data: 0.0002 max mem: 54684 +[04:13:28.259482] Epoch: [1] [1040/3229] lr: 0.000067 grad_norm: 0.5331 (0.5720) closs: 1.0344 (1.0428) time: 4.0746 data: 0.0002 max mem: 54684 +[04:14:09.205642] Epoch: [1] [1050/3229] lr: 0.000067 grad_norm: 0.5576 (0.5721) closs: 1.0379 (1.0427) time: 4.0787 data: 0.0002 max mem: 54684 +[04:14:50.247334] Epoch: [1] [1060/3229] lr: 0.000067 grad_norm: 0.5656 (0.5720) closs: 1.0449 (1.0427) time: 4.0993 data: 0.0002 max mem: 54684 +[04:15:31.279333] Epoch: [1] [1070/3229] lr: 0.000067 grad_norm: 0.5656 (0.5720) closs: 1.0465 (1.0427) time: 4.1036 data: 0.0002 max mem: 54684 +[04:16:11.385132] Epoch: [1] [1080/3229] lr: 0.000066 grad_norm: 0.5653 (0.5719) closs: 1.0382 (1.0424) time: 4.0568 data: 0.0002 max mem: 54684 +[04:16:51.949602] Epoch: [1] [1090/3229] lr: 0.000066 grad_norm: 0.5558 (0.5717) closs: 1.0446 (1.0424) time: 4.0334 data: 0.0002 max mem: 54684 +[04:17:33.042818] Epoch: [1] [1100/3229] lr: 0.000066 grad_norm: 0.5585 (0.5716) closs: 1.0113 (1.0422) time: 4.0828 data: 0.0002 max mem: 54684 +[04:18:14.099932] Epoch: [1] [1110/3229] lr: 0.000066 grad_norm: 0.5712 (0.5716) closs: 1.0113 (1.0422) time: 4.1074 data: 0.0002 max mem: 54684 +[04:18:54.199882] Epoch: [1] [1120/3229] lr: 0.000066 grad_norm: 0.5757 (0.5718) closs: 1.0226 (1.0422) time: 4.0578 data: 0.0002 max mem: 54684 +[04:19:35.557680] Epoch: [1] [1130/3229] lr: 0.000066 grad_norm: 0.5680 (0.5719) closs: 1.0245 (1.0422) time: 4.0728 data: 0.0002 max mem: 54684 +[04:20:16.893984] Epoch: [1] [1140/3229] lr: 0.000065 grad_norm: 0.5726 (0.5720) closs: 1.0635 (1.0426) time: 4.1346 data: 0.0002 max mem: 54684 +[04:20:57.545187] Epoch: [1] [1150/3229] lr: 0.000065 grad_norm: 0.5726 (0.5718) closs: 1.0640 (1.0426) time: 4.0993 data: 0.0002 max mem: 54684 +[04:21:38.378059] Epoch: [1] [1160/3229] lr: 0.000065 grad_norm: 0.5435 (0.5716) closs: 1.0238 (1.0422) time: 4.0741 data: 0.0002 max mem: 54684 +[04:22:19.327764] Epoch: [1] [1170/3229] lr: 0.000065 grad_norm: 0.5553 (0.5715) closs: 1.0354 (1.0423) time: 4.0891 data: 0.0002 max mem: 54684 +[04:23:00.226177] Epoch: [1] [1180/3229] lr: 0.000065 grad_norm: 0.5553 (0.5713) closs: 1.0479 (1.0421) time: 4.0923 data: 0.0002 max mem: 54684 +[04:23:41.482756] Epoch: [1] [1190/3229] lr: 0.000065 grad_norm: 0.5714 (0.5714) closs: 1.0327 (1.0423) time: 4.1077 data: 0.0002 max mem: 54684 +[04:24:22.236750] Epoch: [1] [1200/3229] lr: 0.000065 grad_norm: 0.5960 (0.5716) closs: 1.0605 (1.0422) time: 4.1005 data: 0.0002 max mem: 54684 +[04:25:03.615465] Epoch: 
[1] [1210/3229] lr: 0.000064 grad_norm: 0.6099 (0.5717) closs: 1.0683 (1.0424) time: 4.1066 data: 0.0002 max mem: 54684 +[04:25:44.975618] Epoch: [1] [1220/3229] lr: 0.000064 grad_norm: 0.5627 (0.5715) closs: 1.0795 (1.0429) time: 4.1369 data: 0.0002 max mem: 54684 +[04:26:26.237903] Epoch: [1] [1230/3229] lr: 0.000064 grad_norm: 0.5499 (0.5714) closs: 1.0667 (1.0427) time: 4.1311 data: 0.0002 max mem: 54684 +[04:27:07.332364] Epoch: [1] [1240/3229] lr: 0.000064 grad_norm: 0.5514 (0.5713) closs: 1.0283 (1.0430) time: 4.1178 data: 0.0002 max mem: 54684 +[04:27:48.026409] Epoch: [1] [1250/3229] lr: 0.000064 grad_norm: 0.5443 (0.5710) closs: 1.0283 (1.0428) time: 4.0894 data: 0.0002 max mem: 54684 +[04:28:28.843640] Epoch: [1] [1260/3229] lr: 0.000064 grad_norm: 0.5485 (0.5708) closs: 1.0062 (1.0426) time: 4.0755 data: 0.0002 max mem: 54684 +[04:29:09.817124] Epoch: [1] [1270/3229] lr: 0.000063 grad_norm: 0.5490 (0.5707) closs: 0.9909 (1.0422) time: 4.0895 data: 0.0002 max mem: 54684 +[04:29:50.601796] Epoch: [1] [1280/3229] lr: 0.000063 grad_norm: 0.5495 (0.5705) closs: 1.0278 (1.0423) time: 4.0878 data: 0.0002 max mem: 54684 +[04:30:31.934810] Epoch: [1] [1290/3229] lr: 0.000063 grad_norm: 0.5482 (0.5705) closs: 1.0337 (1.0423) time: 4.1058 data: 0.0002 max mem: 54684 +[04:31:12.615588] Epoch: [1] [1300/3229] lr: 0.000063 grad_norm: 0.5511 (0.5707) closs: 1.0266 (1.0421) time: 4.1006 data: 0.0002 max mem: 54684 +[04:31:53.248997] Epoch: [1] [1310/3229] lr: 0.000063 grad_norm: 0.5585 (0.5706) closs: 1.0733 (1.0423) time: 4.0656 data: 0.0002 max mem: 54684 +[04:32:33.350563] Epoch: [1] [1320/3229] lr: 0.000063 grad_norm: 0.5479 (0.5704) closs: 1.0733 (1.0422) time: 4.0367 data: 0.0002 max mem: 54684 +[04:33:13.792288] Epoch: [1] [1330/3229] lr: 0.000062 grad_norm: 0.5475 (0.5701) closs: 1.0293 (1.0419) time: 4.0271 data: 0.0002 max mem: 54684 +[04:33:53.862733] Epoch: [1] [1340/3229] lr: 0.000062 grad_norm: 0.5442 (0.5697) closs: 1.0232 (1.0417) time: 4.0255 data: 0.0002 max mem: 54684 +[04:34:35.166000] Epoch: [1] [1350/3229] lr: 0.000062 grad_norm: 0.5631 (0.5697) closs: 1.0530 (1.0418) time: 4.0686 data: 0.0002 max mem: 54684 +[04:35:15.962952] Epoch: [1] [1360/3229] lr: 0.000062 grad_norm: 0.5428 (0.5694) closs: 1.0668 (1.0418) time: 4.1049 data: 0.0002 max mem: 54684 +[04:35:57.016930] Epoch: [1] [1370/3229] lr: 0.000062 grad_norm: 0.5385 (0.5692) closs: 1.0461 (1.0418) time: 4.0925 data: 0.0002 max mem: 54684 +[04:36:37.994351] Epoch: [1] [1380/3229] lr: 0.000062 grad_norm: 0.5438 (0.5692) closs: 1.0512 (1.0421) time: 4.1015 data: 0.0002 max mem: 54684 +[04:37:19.248670] Epoch: [1] [1390/3229] lr: 0.000061 grad_norm: 0.5749 (0.5692) closs: 1.0790 (1.0425) time: 4.1115 data: 0.0002 max mem: 54684 +[04:38:00.339847] Epoch: [1] [1400/3229] lr: 0.000061 grad_norm: 0.5749 (0.5692) closs: 1.0582 (1.0426) time: 4.1172 data: 0.0002 max mem: 54684 +[04:38:41.671763] Epoch: [1] [1410/3229] lr: 0.000061 grad_norm: 0.5611 (0.5692) closs: 1.0523 (1.0426) time: 4.1211 data: 0.0002 max mem: 54684 +[04:39:22.741937] Epoch: [1] [1420/3229] lr: 0.000061 grad_norm: 0.5466 (0.5694) closs: 1.0461 (1.0424) time: 4.1200 data: 0.0002 max mem: 54684 +[04:40:03.672318] Epoch: [1] [1430/3229] lr: 0.000061 grad_norm: 0.5480 (0.5693) closs: 1.0340 (1.0421) time: 4.1000 data: 0.0002 max mem: 54684 +[04:40:44.764299] Epoch: [1] [1440/3229] lr: 0.000061 grad_norm: 0.5512 (0.5694) closs: 1.0340 (1.0421) time: 4.1011 data: 0.0002 max mem: 54684 +[04:41:25.545847] Epoch: [1] [1450/3229] lr: 0.000061 grad_norm: 0.5475 
(0.5692) closs: 1.0492 (1.0420) time: 4.0936 data: 0.0002 max mem: 54684 +[04:42:06.794631] Epoch: [1] [1460/3229] lr: 0.000060 grad_norm: 0.5468 (0.5692) closs: 1.0348 (1.0419) time: 4.1015 data: 0.0002 max mem: 54684 +[04:42:47.814991] Epoch: [1] [1470/3229] lr: 0.000060 grad_norm: 0.5492 (0.5690) closs: 1.0385 (1.0419) time: 4.1134 data: 0.0002 max mem: 54684 +[04:43:28.390855] Epoch: [1] [1480/3229] lr: 0.000060 grad_norm: 0.5438 (0.5687) closs: 1.0385 (1.0419) time: 4.0797 data: 0.0002 max mem: 54684 +[04:44:09.427647] Epoch: [1] [1490/3229] lr: 0.000060 grad_norm: 0.5306 (0.5687) closs: 1.0286 (1.0419) time: 4.0806 data: 0.0002 max mem: 54684 +[04:44:50.977883] Epoch: [1] [1500/3229] lr: 0.000060 grad_norm: 0.5577 (0.5687) closs: 1.0527 (1.0421) time: 4.1293 data: 0.0002 max mem: 54684 +[04:45:31.600069] Epoch: [1] [1510/3229] lr: 0.000060 grad_norm: 0.5509 (0.5684) closs: 1.0796 (1.0422) time: 4.1085 data: 0.0002 max mem: 54684 +[04:46:12.441511] Epoch: [1] [1520/3229] lr: 0.000059 grad_norm: 0.5530 (0.5684) closs: 1.0560 (1.0422) time: 4.0731 data: 0.0002 max mem: 54684 +[04:46:54.036109] Epoch: [1] [1530/3229] lr: 0.000059 grad_norm: 0.5716 (0.5684) closs: 1.0673 (1.0424) time: 4.1217 data: 0.0002 max mem: 54684 +[04:47:34.994855] Epoch: [1] [1540/3229] lr: 0.000059 grad_norm: 0.5716 (0.5684) closs: 1.0673 (1.0423) time: 4.1276 data: 0.0002 max mem: 54684 +[04:48:16.034968] Epoch: [1] [1550/3229] lr: 0.000059 grad_norm: 0.5361 (0.5683) closs: 1.0618 (1.0423) time: 4.0999 data: 0.0002 max mem: 54684 +[04:48:57.320961] Epoch: [1] [1560/3229] lr: 0.000059 grad_norm: 0.5683 (0.5682) closs: 1.0718 (1.0426) time: 4.1162 data: 0.0002 max mem: 54684 +[04:49:39.100283] Epoch: [1] [1570/3229] lr: 0.000059 grad_norm: 0.5620 (0.5682) closs: 1.0528 (1.0426) time: 4.1532 data: 0.0002 max mem: 54684 +[04:50:20.220872] Epoch: [1] [1580/3229] lr: 0.000058 grad_norm: 0.5433 (0.5680) closs: 1.0445 (1.0425) time: 4.1449 data: 0.0002 max mem: 54684 +[04:51:01.075841] Epoch: [1] [1590/3229] lr: 0.000058 grad_norm: 0.5502 (0.5680) closs: 1.0167 (1.0423) time: 4.0987 data: 0.0002 max mem: 54684 +[04:51:42.368473] Epoch: [1] [1600/3229] lr: 0.000058 grad_norm: 0.5734 (0.5682) closs: 1.0031 (1.0421) time: 4.1073 data: 0.0002 max mem: 54684 +[04:52:23.762144] Epoch: [1] [1610/3229] lr: 0.000058 grad_norm: 0.5806 (0.5683) closs: 1.0540 (1.0423) time: 4.1342 data: 0.0002 max mem: 54684 +[04:53:05.341366] Epoch: [1] [1620/3229] lr: 0.000058 grad_norm: 0.5798 (0.5684) closs: 1.0742 (1.0424) time: 4.1486 data: 0.0002 max mem: 54684 +[04:53:46.502603] Epoch: [1] [1630/3229] lr: 0.000058 grad_norm: 0.5766 (0.5685) closs: 1.0438 (1.0426) time: 4.1370 data: 0.0002 max mem: 54684 +[04:54:27.790824] Epoch: [1] [1640/3229] lr: 0.000057 grad_norm: 0.5684 (0.5684) closs: 1.0736 (1.0429) time: 4.1224 data: 0.0002 max mem: 54684 +[04:55:08.620731] Epoch: [1] [1650/3229] lr: 0.000057 grad_norm: 0.5621 (0.5683) closs: 1.0539 (1.0427) time: 4.1058 data: 0.0002 max mem: 54684 +[04:55:50.049173] Epoch: [1] [1660/3229] lr: 0.000057 grad_norm: 0.5620 (0.5683) closs: 1.0615 (1.0431) time: 4.1129 data: 0.0002 max mem: 54684 +[04:56:31.228990] Epoch: [1] [1670/3229] lr: 0.000057 grad_norm: 0.5627 (0.5682) closs: 1.0626 (1.0430) time: 4.1303 data: 0.0002 max mem: 54684 +[04:57:11.929105] Epoch: [1] [1680/3229] lr: 0.000057 grad_norm: 0.5532 (0.5681) closs: 1.0347 (1.0429) time: 4.0939 data: 0.0002 max mem: 54684 +[04:57:53.331536] Epoch: [1] [1690/3229] lr: 0.000057 grad_norm: 0.5532 (0.5681) closs: 1.0592 (1.0430) time: 4.1051 data: 
0.0002 max mem: 54684 +[04:58:34.426701] Epoch: [1] [1700/3229] lr: 0.000056 grad_norm: 0.5586 (0.5680) closs: 1.0485 (1.0429) time: 4.1248 data: 0.0002 max mem: 54684 +[04:59:15.593463] Epoch: [1] [1710/3229] lr: 0.000056 grad_norm: 0.5681 (0.5680) closs: 1.0279 (1.0429) time: 4.1130 data: 0.0002 max mem: 54684 +[04:59:56.978578] Epoch: [1] [1720/3229] lr: 0.000056 grad_norm: 0.5599 (0.5681) closs: 1.0635 (1.0431) time: 4.1275 data: 0.0002 max mem: 54684 +[05:00:37.607286] Epoch: [1] [1730/3229] lr: 0.000056 grad_norm: 0.5583 (0.5679) closs: 1.0566 (1.0430) time: 4.1006 data: 0.0002 max mem: 54684 +[05:01:18.379952] Epoch: [1] [1740/3229] lr: 0.000056 grad_norm: 0.5404 (0.5678) closs: 1.0427 (1.0430) time: 4.0700 data: 0.0002 max mem: 54684 +[05:01:59.543844] Epoch: [1] [1750/3229] lr: 0.000056 grad_norm: 0.5516 (0.5678) closs: 1.0297 (1.0430) time: 4.0968 data: 0.0002 max mem: 54684 +[05:02:40.638109] Epoch: [1] [1760/3229] lr: 0.000055 grad_norm: 0.5584 (0.5677) closs: 1.0583 (1.0430) time: 4.1128 data: 0.0002 max mem: 54684 +[05:03:21.341435] Epoch: [1] [1770/3229] lr: 0.000055 grad_norm: 0.5533 (0.5677) closs: 1.0667 (1.0430) time: 4.0898 data: 0.0002 max mem: 54684 +[05:04:02.807448] Epoch: [1] [1780/3229] lr: 0.000055 grad_norm: 0.5304 (0.5675) closs: 1.0489 (1.0430) time: 4.1084 data: 0.0002 max mem: 54684 +[05:04:43.977861] Epoch: [1] [1790/3229] lr: 0.000055 grad_norm: 0.5479 (0.5675) closs: 1.0389 (1.0430) time: 4.1318 data: 0.0002 max mem: 54684 +[05:05:25.455837] Epoch: [1] [1800/3229] lr: 0.000055 grad_norm: 0.5531 (0.5675) closs: 1.0299 (1.0429) time: 4.1324 data: 0.0002 max mem: 54684 +[05:06:05.153859] Epoch: [1] [1810/3229] lr: 0.000055 grad_norm: 0.5444 (0.5672) closs: 0.9641 (1.0427) time: 4.0587 data: 0.0002 max mem: 54684 +[05:06:46.267998] Epoch: [1] [1820/3229] lr: 0.000054 grad_norm: 0.5334 (0.5672) closs: 1.0317 (1.0428) time: 4.0405 data: 0.0002 max mem: 54684 +[05:07:27.102663] Epoch: [1] [1830/3229] lr: 0.000054 grad_norm: 0.5543 (0.5671) closs: 1.0532 (1.0428) time: 4.0974 data: 0.0002 max mem: 54684 +[05:08:08.570305] Epoch: [1] [1840/3229] lr: 0.000054 grad_norm: 0.5617 (0.5671) closs: 1.0540 (1.0428) time: 4.1151 data: 0.0002 max mem: 54684 +[05:08:49.571553] Epoch: [1] [1850/3229] lr: 0.000054 grad_norm: 0.5632 (0.5671) closs: 1.0409 (1.0427) time: 4.1234 data: 0.0002 max mem: 54684 +[05:09:30.680753] Epoch: [1] [1860/3229] lr: 0.000054 grad_norm: 0.5681 (0.5671) closs: 1.0322 (1.0426) time: 4.1055 data: 0.0002 max mem: 54684 +[05:10:11.522470] Epoch: [1] [1870/3229] lr: 0.000054 grad_norm: 0.5681 (0.5671) closs: 1.0278 (1.0426) time: 4.0975 data: 0.0002 max mem: 54684 +[05:10:52.916261] Epoch: [1] [1880/3229] lr: 0.000053 grad_norm: 0.5520 (0.5670) closs: 1.0321 (1.0427) time: 4.1117 data: 0.0002 max mem: 54684 +[05:11:33.551352] Epoch: [1] [1890/3229] lr: 0.000053 grad_norm: 0.5682 (0.5670) closs: 1.0321 (1.0426) time: 4.1014 data: 0.0002 max mem: 54684 +[05:12:14.981387] Epoch: [1] [1900/3229] lr: 0.000053 grad_norm: 0.5668 (0.5670) closs: 1.0400 (1.0427) time: 4.1032 data: 0.0002 max mem: 54684 +[05:12:56.144297] Epoch: [1] [1910/3229] lr: 0.000053 grad_norm: 0.5629 (0.5670) closs: 1.0446 (1.0426) time: 4.1296 data: 0.0002 max mem: 54684 +[05:13:36.914042] Epoch: [1] [1920/3229] lr: 0.000053 grad_norm: 0.5629 (0.5671) closs: 1.0190 (1.0425) time: 4.0966 data: 0.0002 max mem: 54684 +[05:14:17.425182] Epoch: [1] [1930/3229] lr: 0.000053 grad_norm: 0.5425 (0.5668) closs: 1.0006 (1.0422) time: 4.0640 data: 0.0002 max mem: 54684 +[05:14:58.117995] Epoch: 
[1] [1940/3229] lr: 0.000052 grad_norm: 0.5504 (0.5667) closs: 1.0006 (1.0422) time: 4.0601 data: 0.0002 max mem: 54684 +[05:15:39.283905] Epoch: [1] [1950/3229] lr: 0.000052 grad_norm: 0.5645 (0.5668) closs: 1.0802 (1.0424) time: 4.0929 data: 0.0002 max mem: 54684 +[05:16:19.758558] Epoch: [1] [1960/3229] lr: 0.000052 grad_norm: 0.5571 (0.5666) closs: 1.0615 (1.0424) time: 4.0820 data: 0.0002 max mem: 54684 +[05:17:01.117349] Epoch: [1] [1970/3229] lr: 0.000052 grad_norm: 0.5513 (0.5667) closs: 1.0395 (1.0423) time: 4.0916 data: 0.0002 max mem: 54684 +[05:17:41.172799] Epoch: [1] [1980/3229] lr: 0.000052 grad_norm: 0.5372 (0.5664) closs: 1.0187 (1.0421) time: 4.0706 data: 0.0002 max mem: 54684 +[05:18:22.369121] Epoch: [1] [1990/3229] lr: 0.000052 grad_norm: 0.5372 (0.5664) closs: 1.0199 (1.0420) time: 4.0625 data: 0.0002 max mem: 54684 +[05:19:03.491186] Epoch: [1] [2000/3229] lr: 0.000051 grad_norm: 0.5705 (0.5664) closs: 1.0474 (1.0420) time: 4.1159 data: 0.0002 max mem: 54684 +[05:19:44.836678] Epoch: [1] [2010/3229] lr: 0.000051 grad_norm: 0.5739 (0.5665) closs: 1.0563 (1.0420) time: 4.1233 data: 0.0002 max mem: 54684 +[05:20:26.174525] Epoch: [1] [2020/3229] lr: 0.000051 grad_norm: 0.5739 (0.5664) closs: 1.0676 (1.0422) time: 4.1341 data: 0.0002 max mem: 54684 +[05:21:07.005141] Epoch: [1] [2030/3229] lr: 0.000051 grad_norm: 0.5539 (0.5664) closs: 1.0687 (1.0421) time: 4.1084 data: 0.0002 max mem: 54684 +[05:21:47.805581] Epoch: [1] [2040/3229] lr: 0.000051 grad_norm: 0.5457 (0.5662) closs: 1.0106 (1.0419) time: 4.0815 data: 0.0002 max mem: 54684 +[05:22:28.761748] Epoch: [1] [2050/3229] lr: 0.000051 grad_norm: 0.5360 (0.5659) closs: 1.0411 (1.0418) time: 4.0878 data: 0.0002 max mem: 54684 +[05:23:09.259309] Epoch: [1] [2060/3229] lr: 0.000050 grad_norm: 0.5417 (0.5658) closs: 1.0135 (1.0417) time: 4.0726 data: 0.0002 max mem: 54684 +[05:23:49.900804] Epoch: [1] [2070/3229] lr: 0.000050 grad_norm: 0.5499 (0.5657) closs: 1.0281 (1.0417) time: 4.0569 data: 0.0002 max mem: 54684 +[05:24:30.222114] Epoch: [1] [2080/3229] lr: 0.000050 grad_norm: 0.5455 (0.5656) closs: 1.0377 (1.0416) time: 4.0481 data: 0.0002 max mem: 54684 +[05:25:10.002823] Epoch: [1] [2090/3229] lr: 0.000050 grad_norm: 0.5285 (0.5654) closs: 0.9802 (1.0412) time: 4.0050 data: 0.0002 max mem: 54684 +[05:25:51.027399] Epoch: [1] [2100/3229] lr: 0.000050 grad_norm: 0.5517 (0.5654) closs: 0.9688 (1.0411) time: 4.0402 data: 0.0002 max mem: 54684 +[05:26:31.682298] Epoch: [1] [2110/3229] lr: 0.000050 grad_norm: 0.5534 (0.5652) closs: 1.0242 (1.0409) time: 4.0839 data: 0.0002 max mem: 54684 +[05:27:13.027787] Epoch: [1] [2120/3229] lr: 0.000049 grad_norm: 0.5357 (0.5652) closs: 1.0134 (1.0408) time: 4.1000 data: 0.0002 max mem: 54684 +[05:27:53.584846] Epoch: [1] [2130/3229] lr: 0.000049 grad_norm: 0.5528 (0.5651) closs: 1.0308 (1.0409) time: 4.0951 data: 0.0002 max mem: 54684 +[05:28:33.774170] Epoch: [1] [2140/3229] lr: 0.000049 grad_norm: 0.5575 (0.5650) closs: 1.0479 (1.0409) time: 4.0373 data: 0.0002 max mem: 54684 +[05:29:15.125615] Epoch: [1] [2150/3229] lr: 0.000049 grad_norm: 0.5506 (0.5650) closs: 1.0479 (1.0411) time: 4.0770 data: 0.0002 max mem: 54684 +[05:29:56.131252] Epoch: [1] [2160/3229] lr: 0.000049 grad_norm: 0.5534 (0.5650) closs: 1.0885 (1.0414) time: 4.1178 data: 0.0002 max mem: 54684 +[05:30:37.305299] Epoch: [1] [2170/3229] lr: 0.000049 grad_norm: 0.5663 (0.5649) closs: 1.0510 (1.0414) time: 4.1089 data: 0.0002 max mem: 54684 +[05:31:18.140992] Epoch: [1] [2180/3229] lr: 0.000049 grad_norm: 0.5663 
(0.5650) closs: 1.0392 (1.0412) time: 4.1004 data: 0.0002 max mem: 54684 +[05:31:58.773232] Epoch: [1] [2190/3229] lr: 0.000048 grad_norm: 0.5678 (0.5649) closs: 1.0366 (1.0413) time: 4.0733 data: 0.0002 max mem: 54684 +[05:32:39.494385] Epoch: [1] [2200/3229] lr: 0.000048 grad_norm: 0.5526 (0.5648) closs: 1.0355 (1.0412) time: 4.0676 data: 0.0002 max mem: 54684 +[05:33:20.916278] Epoch: [1] [2210/3229] lr: 0.000048 grad_norm: 0.5766 (0.5649) closs: 1.0259 (1.0413) time: 4.1071 data: 0.0002 max mem: 54684 +[05:34:01.793686] Epoch: [1] [2220/3229] lr: 0.000048 grad_norm: 0.5669 (0.5648) closs: 1.0546 (1.0414) time: 4.1149 data: 0.0002 max mem: 54684 +[05:34:42.236506] Epoch: [1] [2230/3229] lr: 0.000048 grad_norm: 0.5328 (0.5647) closs: 1.0481 (1.0414) time: 4.0659 data: 0.0002 max mem: 54684 +[05:35:22.940509] Epoch: [1] [2240/3229] lr: 0.000048 grad_norm: 0.5445 (0.5646) closs: 1.0309 (1.0413) time: 4.0573 data: 0.0002 max mem: 54684 +[05:36:03.732476] Epoch: [1] [2250/3229] lr: 0.000047 grad_norm: 0.5293 (0.5645) closs: 1.0312 (1.0413) time: 4.0747 data: 0.0002 max mem: 54684 +[05:36:44.583014] Epoch: [1] [2260/3229] lr: 0.000047 grad_norm: 0.5293 (0.5645) closs: 1.0900 (1.0415) time: 4.0821 data: 0.0002 max mem: 54684 +[05:37:25.753741] Epoch: [1] [2270/3229] lr: 0.000047 grad_norm: 0.5454 (0.5645) closs: 1.0398 (1.0414) time: 4.1010 data: 0.0002 max mem: 54684 +[05:38:06.354374] Epoch: [1] [2280/3229] lr: 0.000047 grad_norm: 0.5742 (0.5645) closs: 1.0278 (1.0414) time: 4.0885 data: 0.0002 max mem: 54684 +[05:38:47.481707] Epoch: [1] [2290/3229] lr: 0.000047 grad_norm: 0.5408 (0.5643) closs: 1.0250 (1.0412) time: 4.0863 data: 0.0002 max mem: 54684 +[05:39:28.305137] Epoch: [1] [2300/3229] lr: 0.000047 grad_norm: 0.5408 (0.5643) closs: 1.0250 (1.0412) time: 4.0975 data: 0.0002 max mem: 54684 +[05:40:09.396276] Epoch: [1] [2310/3229] lr: 0.000046 grad_norm: 0.5551 (0.5642) closs: 1.0423 (1.0412) time: 4.0957 data: 0.0002 max mem: 54684 +[05:40:50.316973] Epoch: [1] [2320/3229] lr: 0.000046 grad_norm: 0.5588 (0.5643) closs: 1.0553 (1.0412) time: 4.1005 data: 0.0002 max mem: 54684 +[05:41:31.693870] Epoch: [1] [2330/3229] lr: 0.000046 grad_norm: 0.5540 (0.5642) closs: 1.0652 (1.0413) time: 4.1148 data: 0.0002 max mem: 54684 +[05:42:12.473663] Epoch: [1] [2340/3229] lr: 0.000046 grad_norm: 0.5477 (0.5642) closs: 1.0847 (1.0415) time: 4.1078 data: 0.0002 max mem: 54684 +[05:42:53.480264] Epoch: [1] [2350/3229] lr: 0.000046 grad_norm: 0.5727 (0.5644) closs: 1.0970 (1.0416) time: 4.0893 data: 0.0002 max mem: 54684 +[05:43:33.754469] Epoch: [1] [2360/3229] lr: 0.000046 grad_norm: 0.5439 (0.5642) closs: 1.0598 (1.0416) time: 4.0640 data: 0.0002 max mem: 54684 +[05:44:14.489959] Epoch: [1] [2370/3229] lr: 0.000045 grad_norm: 0.5382 (0.5642) closs: 1.0300 (1.0414) time: 4.0504 data: 0.0002 max mem: 54684 +[05:44:55.310165] Epoch: [1] [2380/3229] lr: 0.000045 grad_norm: 0.5399 (0.5642) closs: 1.0297 (1.0415) time: 4.0777 data: 0.0002 max mem: 54684 +[05:45:35.666164] Epoch: [1] [2390/3229] lr: 0.000045 grad_norm: 0.5510 (0.5642) closs: 1.0400 (1.0414) time: 4.0587 data: 0.0002 max mem: 54684 +[05:46:16.639176] Epoch: [1] [2400/3229] lr: 0.000045 grad_norm: 0.5578 (0.5642) closs: 1.0323 (1.0413) time: 4.0664 data: 0.0002 max mem: 54684 +[05:46:56.787546] Epoch: [1] [2410/3229] lr: 0.000045 grad_norm: 0.5509 (0.5640) closs: 1.0328 (1.0414) time: 4.0560 data: 0.0002 max mem: 54684 +[05:47:37.581569] Epoch: [1] [2420/3229] lr: 0.000045 grad_norm: 0.5329 (0.5640) closs: 1.0172 (1.0412) time: 4.0471 data: 
0.0002 max mem: 54684 +[05:48:18.952327] Epoch: [1] [2430/3229] lr: 0.000044 grad_norm: 0.5562 (0.5640) closs: 1.0190 (1.0412) time: 4.1082 data: 0.0002 max mem: 54684 +[05:48:59.548816] Epoch: [1] [2440/3229] lr: 0.000044 grad_norm: 0.5517 (0.5639) closs: 1.0202 (1.0412) time: 4.0983 data: 0.0002 max mem: 54684 +[05:49:40.816318] Epoch: [1] [2450/3229] lr: 0.000044 grad_norm: 0.5512 (0.5639) closs: 1.0606 (1.0414) time: 4.0931 data: 0.0002 max mem: 54684 +[05:50:21.612829] Epoch: [1] [2460/3229] lr: 0.000044 grad_norm: 0.5661 (0.5639) closs: 1.0619 (1.0415) time: 4.1031 data: 0.0002 max mem: 54684 +[05:51:02.032606] Epoch: [1] [2470/3229] lr: 0.000044 grad_norm: 0.5228 (0.5637) closs: 1.0373 (1.0414) time: 4.0607 data: 0.0002 max mem: 54684 +[05:51:42.669343] Epoch: [1] [2480/3229] lr: 0.000044 grad_norm: 0.5141 (0.5637) closs: 1.0318 (1.0414) time: 4.0528 data: 0.0002 max mem: 54684 +[05:52:23.947666] Epoch: [1] [2490/3229] lr: 0.000043 grad_norm: 0.5556 (0.5637) closs: 1.0080 (1.0413) time: 4.0957 data: 0.0002 max mem: 54684 +[05:53:04.419032] Epoch: [1] [2500/3229] lr: 0.000043 grad_norm: 0.5681 (0.5636) closs: 0.9925 (1.0411) time: 4.0874 data: 0.0002 max mem: 54684 +[05:53:44.810109] Epoch: [1] [2510/3229] lr: 0.000043 grad_norm: 0.5536 (0.5635) closs: 1.0025 (1.0409) time: 4.0431 data: 0.0002 max mem: 54684 +[05:54:25.457976] Epoch: [1] [2520/3229] lr: 0.000043 grad_norm: 0.5589 (0.5635) closs: 1.0145 (1.0408) time: 4.0519 data: 0.0002 max mem: 54684 +[05:55:06.177805] Epoch: [1] [2530/3229] lr: 0.000043 grad_norm: 0.5589 (0.5634) closs: 1.0310 (1.0409) time: 4.0683 data: 0.0002 max mem: 54684 +[05:55:46.968680] Epoch: [1] [2540/3229] lr: 0.000043 grad_norm: 0.5613 (0.5634) closs: 1.0699 (1.0409) time: 4.0755 data: 0.0002 max mem: 54684 +[05:56:28.310551] Epoch: [1] [2550/3229] lr: 0.000042 grad_norm: 0.5632 (0.5634) closs: 1.0699 (1.0410) time: 4.1066 data: 0.0002 max mem: 54684 +[05:57:08.977358] Epoch: [1] [2560/3229] lr: 0.000042 grad_norm: 0.5632 (0.5634) closs: 1.0260 (1.0409) time: 4.1004 data: 0.0002 max mem: 54684 +[05:57:49.989591] Epoch: [1] [2570/3229] lr: 0.000042 grad_norm: 0.5523 (0.5633) closs: 1.0473 (1.0411) time: 4.0839 data: 0.0002 max mem: 54684 +[05:58:31.203250] Epoch: [1] [2580/3229] lr: 0.000042 grad_norm: 0.5427 (0.5633) closs: 1.0885 (1.0413) time: 4.1112 data: 0.0002 max mem: 54684 +[05:59:12.459151] Epoch: [1] [2590/3229] lr: 0.000042 grad_norm: 0.5640 (0.5634) closs: 1.0807 (1.0414) time: 4.1234 data: 0.0002 max mem: 54684 +[05:59:53.419598] Epoch: [1] [2600/3229] lr: 0.000042 grad_norm: 0.5771 (0.5633) closs: 1.0600 (1.0413) time: 4.1108 data: 0.0002 max mem: 54684 +[06:00:34.153034] Epoch: [1] [2610/3229] lr: 0.000041 grad_norm: 0.5546 (0.5632) closs: 0.9869 (1.0412) time: 4.0846 data: 0.0002 max mem: 54684 +[06:01:15.086879] Epoch: [1] [2620/3229] lr: 0.000041 grad_norm: 0.5557 (0.5632) closs: 0.9869 (1.0411) time: 4.0833 data: 0.0002 max mem: 54684 +[06:01:56.283580] Epoch: [1] [2630/3229] lr: 0.000041 grad_norm: 0.5558 (0.5632) closs: 1.0490 (1.0413) time: 4.1065 data: 0.0002 max mem: 54684 +[06:02:37.719061] Epoch: [1] [2640/3229] lr: 0.000041 grad_norm: 0.5613 (0.5632) closs: 1.0643 (1.0413) time: 4.1315 data: 0.0002 max mem: 54684 +[06:03:18.915077] Epoch: [1] [2650/3229] lr: 0.000041 grad_norm: 0.5604 (0.5632) closs: 1.0350 (1.0413) time: 4.1315 data: 0.0002 max mem: 54684 +[06:03:59.811949] Epoch: [1] [2660/3229] lr: 0.000041 grad_norm: 0.5654 (0.5632) closs: 1.0307 (1.0414) time: 4.1046 data: 0.0002 max mem: 54684 +[06:04:40.786635] Epoch: 
[1] [2670/3229] lr: 0.000041 grad_norm: 0.5775 (0.5633) closs: 1.0293 (1.0413) time: 4.0935 data: 0.0002 max mem: 54684 +[06:05:22.161936] Epoch: [1] [2680/3229] lr: 0.000040 grad_norm: 0.5595 (0.5632) closs: 1.0402 (1.0413) time: 4.1174 data: 0.0002 max mem: 54684 +[06:06:02.707096] Epoch: [1] [2690/3229] lr: 0.000040 grad_norm: 0.5369 (0.5631) closs: 1.0413 (1.0413) time: 4.0960 data: 0.0002 max mem: 54684 +[06:06:43.753741] Epoch: [1] [2700/3229] lr: 0.000040 grad_norm: 0.5548 (0.5631) closs: 1.0285 (1.0413) time: 4.0795 data: 0.0002 max mem: 54684 +[06:07:23.992092] Epoch: [1] [2710/3229] lr: 0.000040 grad_norm: 0.5629 (0.5632) closs: 1.0146 (1.0411) time: 4.0642 data: 0.0002 max mem: 54684 +[06:08:05.152339] Epoch: [1] [2720/3229] lr: 0.000040 grad_norm: 0.5591 (0.5632) closs: 1.0251 (1.0410) time: 4.0699 data: 0.0002 max mem: 54684 +[06:08:46.239502] Epoch: [1] [2730/3229] lr: 0.000040 grad_norm: 0.5534 (0.5631) closs: 1.0251 (1.0410) time: 4.1123 data: 0.0002 max mem: 54684 +[06:09:27.133028] Epoch: [1] [2740/3229] lr: 0.000039 grad_norm: 0.5534 (0.5631) closs: 1.0251 (1.0410) time: 4.0990 data: 0.0002 max mem: 54684 +[06:10:07.799998] Epoch: [1] [2750/3229] lr: 0.000039 grad_norm: 0.5710 (0.5630) closs: 1.0134 (1.0409) time: 4.0780 data: 0.0002 max mem: 54684 +[06:10:49.245922] Epoch: [1] [2760/3229] lr: 0.000039 grad_norm: 0.5520 (0.5630) closs: 1.0222 (1.0409) time: 4.1056 data: 0.0002 max mem: 54684 +[06:11:30.008146] Epoch: [1] [2770/3229] lr: 0.000039 grad_norm: 0.5520 (0.5630) closs: 1.0567 (1.0410) time: 4.1103 data: 0.0002 max mem: 54684 +[06:12:11.059504] Epoch: [1] [2780/3229] lr: 0.000039 grad_norm: 0.5508 (0.5629) closs: 1.0601 (1.0410) time: 4.0906 data: 0.0002 max mem: 54684 +[06:12:51.244841] Epoch: [1] [2790/3229] lr: 0.000039 grad_norm: 0.5376 (0.5628) closs: 1.0292 (1.0410) time: 4.0618 data: 0.0002 max mem: 54684 +[06:13:32.442464] Epoch: [1] [2800/3229] lr: 0.000038 grad_norm: 0.5376 (0.5627) closs: 1.0258 (1.0410) time: 4.0691 data: 0.0002 max mem: 54684 +[06:14:12.900525] Epoch: [1] [2810/3229] lr: 0.000038 grad_norm: 0.5395 (0.5626) closs: 1.0165 (1.0408) time: 4.0827 data: 0.0002 max mem: 54684 +[06:14:54.239092] Epoch: [1] [2820/3229] lr: 0.000038 grad_norm: 0.5645 (0.5626) closs: 1.0285 (1.0410) time: 4.0898 data: 0.0002 max mem: 54684 +[06:15:34.778065] Epoch: [1] [2830/3229] lr: 0.000038 grad_norm: 0.5645 (0.5627) closs: 1.0790 (1.0411) time: 4.0938 data: 0.0002 max mem: 54684 +[06:16:15.572869] Epoch: [1] [2840/3229] lr: 0.000038 grad_norm: 0.5464 (0.5627) closs: 1.0548 (1.0411) time: 4.0666 data: 0.0002 max mem: 54684 +[06:16:55.693103] Epoch: [1] [2850/3229] lr: 0.000038 grad_norm: 0.5409 (0.5625) closs: 0.9916 (1.0409) time: 4.0457 data: 0.0002 max mem: 54684 +[06:17:36.370544] Epoch: [1] [2860/3229] lr: 0.000038 grad_norm: 0.5450 (0.5624) closs: 1.0093 (1.0409) time: 4.0398 data: 0.0002 max mem: 54684 +[06:18:16.935227] Epoch: [1] [2870/3229] lr: 0.000037 grad_norm: 0.5436 (0.5623) closs: 1.0331 (1.0408) time: 4.0620 data: 0.0002 max mem: 54684 +[06:18:58.140683] Epoch: [1] [2880/3229] lr: 0.000037 grad_norm: 0.5241 (0.5622) closs: 1.0383 (1.0409) time: 4.0884 data: 0.0002 max mem: 54684 +[06:19:38.588481] Epoch: [1] [2890/3229] lr: 0.000037 grad_norm: 0.5241 (0.5620) closs: 1.0275 (1.0409) time: 4.0826 data: 0.0002 max mem: 54684 +[06:20:19.641183] Epoch: [1] [2900/3229] lr: 0.000037 grad_norm: 0.5396 (0.5620) closs: 0.9919 (1.0407) time: 4.0750 data: 0.0002 max mem: 54684 +[06:21:00.790118] Epoch: [1] [2910/3229] lr: 0.000037 grad_norm: 0.5550 
(0.5620) closs: 1.0125 (1.0409) time: 4.1100 data: 0.0002 max mem: 54684 +[06:21:42.244795] Epoch: [1] [2920/3229] lr: 0.000037 grad_norm: 0.5740 (0.5621) closs: 1.0125 (1.0408) time: 4.1301 data: 0.0002 max mem: 54684 +[06:22:22.364951] Epoch: [1] [2930/3229] lr: 0.000036 grad_norm: 0.5768 (0.5620) closs: 1.0067 (1.0407) time: 4.0787 data: 0.0002 max mem: 54684 +[06:23:03.086429] Epoch: [1] [2940/3229] lr: 0.000036 grad_norm: 0.5621 (0.5620) closs: 1.0128 (1.0406) time: 4.0420 data: 0.0002 max mem: 54684 +[06:23:43.579564] Epoch: [1] [2950/3229] lr: 0.000036 grad_norm: 0.5615 (0.5620) closs: 1.0128 (1.0405) time: 4.0607 data: 0.0002 max mem: 54684 +[06:24:24.403706] Epoch: [1] [2960/3229] lr: 0.000036 grad_norm: 0.5591 (0.5619) closs: 1.0059 (1.0404) time: 4.0658 data: 0.0002 max mem: 54684 +[06:25:05.177195] Epoch: [1] [2970/3229] lr: 0.000036 grad_norm: 0.5515 (0.5619) closs: 1.0162 (1.0403) time: 4.0798 data: 0.0002 max mem: 54684 +[06:25:45.583856] Epoch: [1] [2980/3229] lr: 0.000036 grad_norm: 0.5408 (0.5618) closs: 1.0081 (1.0402) time: 4.0589 data: 0.0002 max mem: 54684 +[06:26:26.764185] Epoch: [1] [2990/3229] lr: 0.000036 grad_norm: 0.5408 (0.5618) closs: 1.0228 (1.0403) time: 4.0793 data: 0.0002 max mem: 54684 +[06:27:08.232014] Epoch: [1] [3000/3229] lr: 0.000035 grad_norm: 0.5497 (0.5618) closs: 1.0729 (1.0404) time: 4.1323 data: 0.0002 max mem: 54684 +[06:27:49.342316] Epoch: [1] [3010/3229] lr: 0.000035 grad_norm: 0.5438 (0.5618) closs: 1.0628 (1.0404) time: 4.1288 data: 0.0002 max mem: 54684 +[06:28:30.414609] Epoch: [1] [3020/3229] lr: 0.000035 grad_norm: 0.5503 (0.5618) closs: 1.0482 (1.0403) time: 4.1091 data: 0.0002 max mem: 54684 +[06:29:11.081220] Epoch: [1] [3030/3229] lr: 0.000035 grad_norm: 0.5568 (0.5618) closs: 1.0190 (1.0402) time: 4.0869 data: 0.0002 max mem: 54684 +[06:29:52.489647] Epoch: [1] [3040/3229] lr: 0.000035 grad_norm: 0.5868 (0.5618) closs: 1.0277 (1.0403) time: 4.1037 data: 0.0002 max mem: 54684 +[06:30:33.073169] Epoch: [1] [3050/3229] lr: 0.000035 grad_norm: 0.5631 (0.5618) closs: 1.0394 (1.0403) time: 4.0995 data: 0.0002 max mem: 54684 +[06:31:14.013273] Epoch: [1] [3060/3229] lr: 0.000034 grad_norm: 0.5527 (0.5618) closs: 1.0443 (1.0403) time: 4.0761 data: 0.0002 max mem: 54684 +[06:31:54.597721] Epoch: [1] [3070/3229] lr: 0.000034 grad_norm: 0.5434 (0.5617) closs: 1.0250 (1.0402) time: 4.0762 data: 0.0001 max mem: 54684 +[06:32:35.628475] Epoch: [1] [3080/3229] lr: 0.000034 grad_norm: 0.5593 (0.5617) closs: 1.0317 (1.0403) time: 4.0807 data: 0.0002 max mem: 54684 +[06:33:15.743784] Epoch: [1] [3090/3229] lr: 0.000034 grad_norm: 0.5484 (0.5615) closs: 1.0518 (1.0402) time: 4.0572 data: 0.0002 max mem: 54684 +[06:33:56.228495] Epoch: [1] [3100/3229] lr: 0.000034 grad_norm: 0.5205 (0.5615) closs: 0.9813 (1.0401) time: 4.0299 data: 0.0002 max mem: 54684 +[06:34:37.480013] Epoch: [1] [3110/3229] lr: 0.000034 grad_norm: 0.5548 (0.5615) closs: 1.0324 (1.0402) time: 4.0867 data: 0.0002 max mem: 54684 +[06:35:18.864626] Epoch: [1] [3120/3229] lr: 0.000034 grad_norm: 0.5548 (0.5615) closs: 1.0525 (1.0403) time: 4.1317 data: 0.0002 max mem: 54684 +[06:35:59.777377] Epoch: [1] [3130/3229] lr: 0.000033 grad_norm: 0.5428 (0.5614) closs: 1.0628 (1.0403) time: 4.1148 data: 0.0002 max mem: 54684 +[06:36:40.713675] Epoch: [1] [3140/3229] lr: 0.000033 grad_norm: 0.5397 (0.5614) closs: 1.0246 (1.0402) time: 4.0924 data: 0.0002 max mem: 54684 +[06:37:21.338622] Epoch: [1] [3150/3229] lr: 0.000033 grad_norm: 0.5575 (0.5614) closs: 1.0182 (1.0403) time: 4.0780 data: 
+[06:38:01.899004] Epoch: [1] [3160/3229] lr: 0.000033 grad_norm: 0.5606 (0.5613) closs: 1.0391 (1.0402) time: 4.0592 data: 0.0002 max mem: 54684
+[06:38:42.174203] Epoch: [1] [3170/3229] lr: 0.000033 grad_norm: 0.5272 (0.5613) closs: 0.9944 (1.0401) time: 4.0417 data: 0.0002 max mem: 54684
+[06:39:22.446835] Epoch: [1] [3180/3229] lr: 0.000033 grad_norm: 0.5411 (0.5612) closs: 1.0080 (1.0400) time: 4.0273 data: 0.0002 max mem: 54684
+[06:40:03.306382] Epoch: [1] [3190/3229] lr: 0.000032 grad_norm: 0.5411 (0.5612) closs: 1.0599 (1.0399) time: 4.0565 data: 0.0002 max mem: 54684
+[06:40:43.545456] Epoch: [1] [3200/3229] lr: 0.000032 grad_norm: 0.5426 (0.5612) closs: 1.0677 (1.0399) time: 4.0549 data: 0.0003 max mem: 54684
+[06:41:24.089321] Epoch: [1] [3210/3229] lr: 0.000032 grad_norm: 0.5565 (0.5611) closs: 1.0318 (1.0398) time: 4.0391 data: 0.0003 max mem: 54684
+[06:42:05.015907] Epoch: [1] [3220/3229] lr: 0.000032 grad_norm: 0.5565 (0.5611) closs: 1.0518 (1.0399) time: 4.0735 data: 0.0001 max mem: 54684
+[06:42:37.506577] Epoch: [1] Total time: 3:40:13
+[06:42:37.507422] Averaged stats: lr: 0.000032 grad_norm: 0.5404 (0.5610) closs: 1.0248 (1.0389)
+[06:42:37.848477] model saved
+[06:42:39.524269] optimizer saved
+[06:42:39.524869] other rank-common saved
+[06:42:39.529811] rank-specific saved
+[06:42:39.543909] log_dir: ./output_dir
+[06:42:52.332922] Epoch: [2] [0/3229] lr: 0.000032 grad_norm: 0.6074 (0.6074) closs: 0.9961 (0.9961) time: 12.7882 data: 8.6913 max mem: 54684
+[06:43:33.251059] Epoch: [2] [10/3229] lr: 0.000032 grad_norm: 0.5549 (0.5515) closs: 1.0108 (1.0250) time: 4.8823 data: 0.7903 max mem: 54684
+[06:44:14.062725] Epoch: [2] [20/3229] lr: 0.000032 grad_norm: 0.5604 (0.5634) closs: 1.0132 (1.0331) time: 4.0864 data: 0.0002 max mem: 54684
+[06:44:54.680230] Epoch: [2] [30/3229] lr: 0.000031 grad_norm: 0.5646 (0.5620) closs: 1.0207 (1.0222) time: 4.0714 data: 0.0002 max mem: 54684
+[06:45:36.094248] Epoch: [2] [40/3229] lr: 0.000031 grad_norm: 0.5666 (0.5645) closs: 1.0272 (1.0325) time: 4.1015 data: 0.0002 max mem: 54684
+[06:46:17.586387] Epoch: [2] [50/3229] lr: 0.000031 grad_norm: 0.5721 (0.5678) closs: 1.0488 (1.0357) time: 4.1452 data: 0.0002 max mem: 54684
+[06:46:58.623930] Epoch: [2] [60/3229] lr: 0.000031 grad_norm: 0.5721 (0.5684) closs: 1.0254 (1.0385) time: 4.1264 data: 0.0002 max mem: 54684
+[06:47:38.804594] Epoch: [2] [70/3229] lr: 0.000031 grad_norm: 0.5414 (0.5640) closs: 1.0238 (1.0330) time: 4.0608 data: 0.0002 max mem: 54684
+[06:48:19.867348] Epoch: [2] [80/3229] lr: 0.000031 grad_norm: 0.5383 (0.5644) closs: 1.0562 (1.0336) time: 4.0621 data: 0.0002 max mem: 54684
+[06:49:00.413656] Epoch: [2] [90/3229] lr: 0.000031 grad_norm: 0.5648 (0.5635) closs: 1.0639 (1.0332) time: 4.0804 data: 0.0002 max mem: 54684
+[06:49:41.951060] Epoch: [2] [100/3229] lr: 0.000030 grad_norm: 0.5660 (0.5647) closs: 1.0607 (1.0380) time: 4.1041 data: 0.0002 max mem: 54684
+[06:50:22.979532] Epoch: [2] [110/3229] lr: 0.000030 grad_norm: 0.5676 (0.5650) closs: 1.0512 (1.0370) time: 4.1282 data: 0.0002 max mem: 54684
+[06:51:04.182535] Epoch: [2] [120/3229] lr: 0.000030 grad_norm: 0.5679 (0.5652) closs: 1.0298 (1.0344) time: 4.1115 data: 0.0002 max mem: 54684
+[06:51:45.596931] Epoch: [2] [130/3229] lr: 0.000030 grad_norm: 0.5720 (0.5668) closs: 1.0407 (1.0365) time: 4.1308 data: 0.0002 max mem: 54684
+[06:52:25.981391] Epoch: [2] [140/3229] lr: 0.000030 grad_norm: 0.5643 (0.5656) closs: 1.0547 (1.0368) time: 4.0899 data: 0.0002 max mem: 54684
+[06:53:06.534877] Epoch: [2] [150/3229] lr: 0.000030 grad_norm: 0.5585 (0.5647) closs: 1.0643 (1.0353) time: 4.0468 data: 0.0002 max mem: 54684
+[06:53:47.561712] Epoch: [2] [160/3229] lr: 0.000030 grad_norm: 0.5707 (0.5644) closs: 1.0438 (1.0367) time: 4.0789 data: 0.0002 max mem: 54684
+[06:54:28.735079] Epoch: [2] [170/3229] lr: 0.000029 grad_norm: 0.5660 (0.5639) closs: 1.0206 (1.0351) time: 4.1099 data: 0.0002 max mem: 54684
+[06:55:09.714577] Epoch: [2] [180/3229] lr: 0.000029 grad_norm: 0.5598 (0.5634) closs: 1.0089 (1.0340) time: 4.1076 data: 0.0002 max mem: 54684
+[06:55:50.561442] Epoch: [2] [190/3229] lr: 0.000029 grad_norm: 0.5587 (0.5624) closs: 1.0149 (1.0343) time: 4.0913 data: 0.0002 max mem: 54684
+[06:56:30.629863] Epoch: [2] [200/3229] lr: 0.000029 grad_norm: 0.5501 (0.5617) closs: 1.0149 (1.0314) time: 4.0457 data: 0.0002 max mem: 54684
+[06:57:10.515610] Epoch: [2] [210/3229] lr: 0.000029 grad_norm: 0.5364 (0.5595) closs: 0.9756 (1.0304) time: 3.9976 data: 0.0002 max mem: 54684
+[06:57:51.901450] Epoch: [2] [220/3229] lr: 0.000029 grad_norm: 0.5558 (0.5605) closs: 1.0786 (1.0334) time: 4.0635 data: 0.0002 max mem: 54684
+[06:58:33.059748] Epoch: [2] [230/3229] lr: 0.000029 grad_norm: 0.5713 (0.5605) closs: 1.0664 (1.0330) time: 4.1271 data: 0.0002 max mem: 54684
+[06:59:13.767459] Epoch: [2] [240/3229] lr: 0.000028 grad_norm: 0.5606 (0.5594) closs: 1.0463 (1.0324) time: 4.0932 data: 0.0002 max mem: 54684
+[06:59:54.366626] Epoch: [2] [250/3229] lr: 0.000028 grad_norm: 0.5597 (0.5591) closs: 1.0473 (1.0327) time: 4.0653 data: 0.0002 max mem: 54684
+[07:00:35.448278] Epoch: [2] [260/3229] lr: 0.000028 grad_norm: 0.5615 (0.5590) closs: 1.0473 (1.0337) time: 4.0840 data: 0.0002 max mem: 54684
+[07:01:16.282495] Epoch: [2] [270/3229] lr: 0.000028 grad_norm: 0.5692 (0.5587) closs: 1.0175 (1.0332) time: 4.0957 data: 0.0002 max mem: 54684
+[07:01:57.347029] Epoch: [2] [280/3229] lr: 0.000028 grad_norm: 0.5577 (0.5587) closs: 1.0175 (1.0331) time: 4.0949 data: 0.0002 max mem: 54684
+[07:02:38.198946] Epoch: [2] [290/3229] lr: 0.000028 grad_norm: 0.5458 (0.5578) closs: 1.0612 (1.0344) time: 4.0958 data: 0.0002 max mem: 54684
+[07:03:19.538016] Epoch: [2] [300/3229] lr: 0.000028 grad_norm: 0.5543 (0.5585) closs: 1.0482 (1.0345) time: 4.1095 data: 0.0002 max mem: 54684
+[07:04:00.398339] Epoch: [2] [310/3229] lr: 0.000027 grad_norm: 0.5669 (0.5587) closs: 1.0378 (1.0337) time: 4.1099 data: 0.0002 max mem: 54684
+[07:04:41.202079] Epoch: [2] [320/3229] lr: 0.000027 grad_norm: 0.5484 (0.5583) closs: 1.0331 (1.0335) time: 4.0831 data: 0.0002 max mem: 54684
+[07:05:22.294422] Epoch: [2] [330/3229] lr: 0.000027 grad_norm: 0.5676 (0.5594) closs: 1.0367 (1.0340) time: 4.0947 data: 0.0002 max mem: 54684
+[07:06:02.338520] Epoch: [2] [340/3229] lr: 0.000027 grad_norm: 0.5636 (0.5582) closs: 1.0367 (1.0338) time: 4.0568 data: 0.0002 max mem: 54684
+[07:06:43.190555] Epoch: [2] [350/3229] lr: 0.000027 grad_norm: 0.5512 (0.5587) closs: 1.0080 (1.0331) time: 4.0447 data: 0.0002 max mem: 54684
+[07:07:23.981238] Epoch: [2] [360/3229] lr: 0.000027 grad_norm: 0.5835 (0.5591) closs: 0.9637 (1.0310) time: 4.0821 data: 0.0002 max mem: 54684
+[07:08:04.100084] Epoch: [2] [370/3229] lr: 0.000027 grad_norm: 0.5483 (0.5592) closs: 0.9577 (1.0297) time: 4.0454 data: 0.0002 max mem: 54684
+[07:08:45.172933] Epoch: [2] [380/3229] lr: 0.000026 grad_norm: 0.5730 (0.5599) closs: 1.0405 (1.0300) time: 4.0595 data: 0.0002 max mem: 54684
+[07:09:26.437556] Epoch: [2] [390/3229] lr: 0.000026 grad_norm: 0.5819 (0.5601) closs: 1.0400 (1.0298) time: 4.1168 data: 0.0002 max mem: 54684
+[07:10:06.949904] Epoch: [2] [400/3229] lr: 0.000026 grad_norm: 0.5788 (0.5598) closs: 0.9957 (1.0292) time: 4.0888 data: 0.0002 max mem: 54684
+[07:10:47.514701] Epoch: [2] [410/3229] lr: 0.000026 grad_norm: 0.5763 (0.5596) closs: 1.0425 (1.0290) time: 4.0538 data: 0.0002 max mem: 54684
+[07:11:28.002800] Epoch: [2] [420/3229] lr: 0.000026 grad_norm: 0.5434 (0.5590) closs: 1.0287 (1.0276) time: 4.0526 data: 0.0002 max mem: 54684
+[07:12:08.954598] Epoch: [2] [430/3229] lr: 0.000026 grad_norm: 0.5331 (0.5590) closs: 1.0064 (1.0278) time: 4.0719 data: 0.0002 max mem: 54684
+[07:12:50.182692] Epoch: [2] [440/3229] lr: 0.000026 grad_norm: 0.5630 (0.5593) closs: 1.0357 (1.0279) time: 4.1089 data: 0.0002 max mem: 54684
+[07:13:31.159720] Epoch: [2] [450/3229] lr: 0.000025 grad_norm: 0.5743 (0.5594) closs: 1.0429 (1.0276) time: 4.1102 data: 0.0002 max mem: 54684
+[07:14:12.156189] Epoch: [2] [460/3229] lr: 0.000025 grad_norm: 0.5510 (0.5592) closs: 0.9867 (1.0272) time: 4.0986 data: 0.0002 max mem: 54684
+[07:14:53.144867] Epoch: [2] [470/3229] lr: 0.000025 grad_norm: 0.5739 (0.5619) closs: 1.0229 (1.0268) time: 4.0992 data: 0.0002 max mem: 54684
+[07:15:33.994729] Epoch: [2] [480/3229] lr: 0.000025 grad_norm: 0.5799 (0.5615) closs: 1.0344 (1.0262) time: 4.0919 data: 0.0002 max mem: 54684
+[07:16:14.919123] Epoch: [2] [490/3229] lr: 0.000025 grad_norm: 0.5602 (0.5613) closs: 1.0123 (1.0258) time: 4.0887 data: 0.0002 max mem: 54684
+[07:16:55.482144] Epoch: [2] [500/3229] lr: 0.000025 grad_norm: 0.5650 (0.5613) closs: 1.0011 (1.0257) time: 4.0743 data: 0.0002 max mem: 54684
+[07:17:36.009652] Epoch: [2] [510/3229] lr: 0.000025 grad_norm: 0.5666 (0.5608) closs: 1.0011 (1.0253) time: 4.0545 data: 0.0002 max mem: 54684
+[07:18:17.007812] Epoch: [2] [520/3229] lr: 0.000025 grad_norm: 0.5666 (0.5610) closs: 1.0167 (1.0254) time: 4.0762 data: 0.0002 max mem: 54684
+[07:18:57.928751] Epoch: [2] [530/3229] lr: 0.000024 grad_norm: 0.5620 (0.5609) closs: 1.0517 (1.0253) time: 4.0959 data: 0.0002 max mem: 54684
+[07:19:38.814132] Epoch: [2] [540/3229] lr: 0.000024 grad_norm: 0.5700 (0.5613) closs: 1.0398 (1.0259) time: 4.0903 data: 0.0002 max mem: 54684
+[07:20:19.972410] Epoch: [2] [550/3229] lr: 0.000024 grad_norm: 0.5808 (0.5616) closs: 1.0295 (1.0258) time: 4.1021 data: 0.0002 max mem: 54684
+[07:21:01.186855] Epoch: [2] [560/3229] lr: 0.000024 grad_norm: 0.5597 (0.5613) closs: 1.0380 (1.0259) time: 4.1186 data: 0.0002 max mem: 54684
+[07:21:42.387976] Epoch: [2] [570/3229] lr: 0.000024 grad_norm: 0.5589 (0.5615) closs: 1.0135 (1.0256) time: 4.1207 data: 0.0002 max mem: 54684
+[07:22:23.711719] Epoch: [2] [580/3229] lr: 0.000024 grad_norm: 0.5725 (0.5619) closs: 1.0135 (1.0259) time: 4.1262 data: 0.0002 max mem: 54684
+[07:23:04.803921] Epoch: [2] [590/3229] lr: 0.000024 grad_norm: 0.5810 (0.5624) closs: 1.0295 (1.0264) time: 4.1207 data: 0.0002 max mem: 54684
+[07:23:46.170461] Epoch: [2] [600/3229] lr: 0.000023 grad_norm: 0.5884 (0.5628) closs: 1.0485 (1.0269) time: 4.1229 data: 0.0002 max mem: 54684
+[07:24:27.382259] Epoch: [2] [610/3229] lr: 0.000023 grad_norm: 0.5673 (0.5631) closs: 1.0487 (1.0275) time: 4.1289 data: 0.0002 max mem: 54684
+[07:25:08.711019] Epoch: [2] [620/3229] lr: 0.000023 grad_norm: 0.5802 (0.5636) closs: 1.0487 (1.0279) time: 4.1270 data: 0.0002 max mem: 54684
+[07:25:49.496061] Epoch: [2] [630/3229] lr: 0.000023 grad_norm: 0.5873 (0.5637) closs: 1.0284 (1.0272) time: 4.1056 data: 0.0002 max mem: 54684
+[07:26:30.840270] Epoch: [2] [640/3229] lr: 0.000023 grad_norm: 0.5644 (0.5637) closs: 1.0242 (1.0272) time: 4.1064 data: 0.0002 max mem: 54684
+[07:27:11.536611] Epoch: [2] [650/3229] lr: 0.000023 grad_norm: 0.5738 (0.5637) closs: 1.0264 (1.0271) time: 4.1020 data: 0.0002 max mem: 54684
+[07:27:52.721716] Epoch: [2] [660/3229] lr: 0.000023 grad_norm: 0.5873 (0.5642) closs: 1.0264 (1.0268) time: 4.0940 data: 0.0002 max mem: 54684
+[07:28:33.018219] Epoch: [2] [670/3229] lr: 0.000023 grad_norm: 0.5817 (0.5643) closs: 0.9670 (1.0256) time: 4.0740 data: 0.0002 max mem: 54684
+[07:29:13.959783] Epoch: [2] [680/3229] lr: 0.000022 grad_norm: 0.5840 (0.5644) closs: 1.0050 (1.0255) time: 4.0618 data: 0.0002 max mem: 54684
+[07:29:54.691528] Epoch: [2] [690/3229] lr: 0.000022 grad_norm: 0.5650 (0.5646) closs: 1.0360 (1.0252) time: 4.0836 data: 0.0002 max mem: 54684
+[07:30:35.214294] Epoch: [2] [700/3229] lr: 0.000022 grad_norm: 0.5549 (0.5646) closs: 0.9915 (1.0249) time: 4.0627 data: 0.0002 max mem: 54684
+[07:31:15.579049] Epoch: [2] [710/3229] lr: 0.000022 grad_norm: 0.5666 (0.5645) closs: 0.9990 (1.0244) time: 4.0443 data: 0.0002 max mem: 54684
+[07:31:56.182963] Epoch: [2] [720/3229] lr: 0.000022 grad_norm: 0.5709 (0.5647) closs: 1.0062 (1.0239) time: 4.0484 data: 0.0002 max mem: 54684
+[07:32:36.940307] Epoch: [2] [730/3229] lr: 0.000022 grad_norm: 0.5456 (0.5644) closs: 1.0022 (1.0242) time: 4.0680 data: 0.0002 max mem: 54684
+[07:33:17.790404] Epoch: [2] [740/3229] lr: 0.000022 grad_norm: 0.5405 (0.5641) closs: 1.0311 (1.0242) time: 4.0803 data: 0.0002 max mem: 54684
+[07:33:58.876257] Epoch: [2] [750/3229] lr: 0.000022 grad_norm: 0.5392 (0.5636) closs: 1.0301 (1.0240) time: 4.0967 data: 0.0002 max mem: 54684
+[07:34:39.404388] Epoch: [2] [760/3229] lr: 0.000021 grad_norm: 0.5370 (0.5635) closs: 0.9803 (1.0237) time: 4.0806 data: 0.0002 max mem: 54684
+[07:35:20.103499] Epoch: [2] [770/3229] lr: 0.000021 grad_norm: 0.5533 (0.5636) closs: 0.9830 (1.0234) time: 4.0613 data: 0.0002 max mem: 54684
+[07:36:01.311015] Epoch: [2] [780/3229] lr: 0.000021 grad_norm: 0.5643 (0.5637) closs: 1.0087 (1.0238) time: 4.0953 data: 0.0002 max mem: 54684
+[07:36:42.010206] Epoch: [2] [790/3229] lr: 0.000021 grad_norm: 0.5717 (0.5642) closs: 1.0332 (1.0232) time: 4.0953 data: 0.0002 max mem: 54684
+[07:37:22.202073] Epoch: [2] [800/3229] lr: 0.000021 grad_norm: 0.5769 (0.5641) closs: 1.0252 (1.0229) time: 4.0445 data: 0.0002 max mem: 54684
+[07:38:03.544700] Epoch: [2] [810/3229] lr: 0.000021 grad_norm: 0.5595 (0.5642) closs: 1.0364 (1.0230) time: 4.0767 data: 0.0002 max mem: 54684
+[07:38:44.043917] Epoch: [2] [820/3229] lr: 0.000021 grad_norm: 0.5705 (0.5643) closs: 1.0233 (1.0226) time: 4.0920 data: 0.0002 max mem: 54684
+[07:39:24.427927] Epoch: [2] [830/3229] lr: 0.000021 grad_norm: 0.5705 (0.5641) closs: 1.0029 (1.0221) time: 4.0441 data: 0.0002 max mem: 54684
+[07:40:05.213763] Epoch: [2] [840/3229] lr: 0.000020 grad_norm: 0.5476 (0.5641) closs: 1.0162 (1.0224) time: 4.0584 data: 0.0002 max mem: 54684
+[07:40:46.257738] Epoch: [2] [850/3229] lr: 0.000020 grad_norm: 0.5579 (0.5643) closs: 1.0359 (1.0226) time: 4.0914 data: 0.0002 max mem: 54684
+[07:41:27.073907] Epoch: [2] [860/3229] lr: 0.000020 grad_norm: 0.5654 (0.5644) closs: 1.0328 (1.0223) time: 4.0929 data: 0.0002 max mem: 54684
+[07:42:08.541626] Epoch: [2] [870/3229] lr: 0.000020 grad_norm: 0.5884 (0.5648) closs: 1.0390 (1.0226) time: 4.1141 data: 0.0002 max mem: 54684
+[07:42:49.010976] Epoch: [2] [880/3229] lr: 0.000020 grad_norm: 0.5884 (0.5647) closs: 1.0361 (1.0224) time: 4.0968 data: 0.0002 max mem: 54684
+[07:43:30.376441] Epoch: [2] [890/3229] lr: 0.000020 grad_norm: 0.5687 (0.5648) closs: 1.0268 (1.0226) time: 4.0917 data: 0.0002 max mem: 54684
+[07:44:11.214842] Epoch: [2] [900/3229] lr: 0.000020 grad_norm: 0.5649 (0.5650) closs: 1.0391 (1.0227) time: 4.1101 data: 0.0002 max mem: 54684
+[07:44:52.760157] Epoch: [2] [910/3229] lr: 0.000020 grad_norm: 0.5771 (0.5651) closs: 1.0420 (1.0231) time: 4.1191 data: 0.0002 max mem: 54684
+[07:45:33.530222] Epoch: [2] [920/3229] lr: 0.000019 grad_norm: 0.5793 (0.5652) closs: 1.0535 (1.0232) time: 4.1157 data: 0.0002 max mem: 54684
+[07:46:14.506847] Epoch: [2] [930/3229] lr: 0.000019 grad_norm: 0.5620 (0.5652) closs: 1.0553 (1.0238) time: 4.0873 data: 0.0002 max mem: 54684
+[07:46:55.368426] Epoch: [2] [940/3229] lr: 0.000019 grad_norm: 0.5562 (0.5652) closs: 1.0217 (1.0234) time: 4.0918 data: 0.0002 max mem: 54684
+[07:47:36.461145] Epoch: [2] [950/3229] lr: 0.000019 grad_norm: 0.5638 (0.5654) closs: 1.0119 (1.0236) time: 4.0976 data: 0.0002 max mem: 54684
+[07:48:17.376220] Epoch: [2] [960/3229] lr: 0.000019 grad_norm: 0.5859 (0.5654) closs: 1.0316 (1.0237) time: 4.1003 data: 0.0002 max mem: 54684
+[07:48:58.590641] Epoch: [2] [970/3229] lr: 0.000019 grad_norm: 0.5773 (0.5656) closs: 1.0316 (1.0239) time: 4.1064 data: 0.0002 max mem: 54684
+[07:49:39.107966] Epoch: [2] [980/3229] lr: 0.000019 grad_norm: 0.5773 (0.5657) closs: 1.0133 (1.0236) time: 4.0865 data: 0.0002 max mem: 54684
+[07:50:19.601649] Epoch: [2] [990/3229] lr: 0.000019 grad_norm: 0.5666 (0.5655) closs: 1.0012 (1.0233) time: 4.0505 data: 0.0002 max mem: 54684
+[07:51:00.873589] Epoch: [2] [1000/3229] lr: 0.000019 grad_norm: 0.5538 (0.5656) closs: 1.0434 (1.0238) time: 4.0882 data: 0.0002 max mem: 54684
+[07:51:41.761900] Epoch: [2] [1010/3229] lr: 0.000018 grad_norm: 0.5582 (0.5656) closs: 1.0501 (1.0241) time: 4.1079 data: 0.0002 max mem: 54684
+[07:52:21.925593] Epoch: [2] [1020/3229] lr: 0.000018 grad_norm: 0.5700 (0.5658) closs: 1.0206 (1.0236) time: 4.0525 data: 0.0002 max mem: 54684
+[07:53:02.807691] Epoch: [2] [1030/3229] lr: 0.000018 grad_norm: 0.5521 (0.5656) closs: 1.0060 (1.0235) time: 4.0522 data: 0.0002 max mem: 54684
+[07:53:43.361018] Epoch: [2] [1040/3229] lr: 0.000018 grad_norm: 0.5521 (0.5656) closs: 0.9824 (1.0230) time: 4.0717 data: 0.0002 max mem: 54684
+[07:54:24.671047] Epoch: [2] [1050/3229] lr: 0.000018 grad_norm: 0.5499 (0.5655) closs: 1.0254 (1.0232) time: 4.0931 data: 0.0002 max mem: 54684
+[07:55:05.437645] Epoch: [2] [1060/3229] lr: 0.000018 grad_norm: 0.5612 (0.5655) closs: 1.0412 (1.0233) time: 4.1038 data: 0.0002 max mem: 54684
+[07:55:46.843185] Epoch: [2] [1070/3229] lr: 0.000018 grad_norm: 0.5612 (0.5653) closs: 1.0080 (1.0233) time: 4.1085 data: 0.0002 max mem: 54684
+[07:56:27.719997] Epoch: [2] [1080/3229] lr: 0.000018 grad_norm: 0.5554 (0.5654) closs: 1.0295 (1.0234) time: 4.1140 data: 0.0002 max mem: 54684
+[07:57:08.643703] Epoch: [2] [1090/3229] lr: 0.000018 grad_norm: 0.5717 (0.5654) closs: 1.0642 (1.0237) time: 4.0900 data: 0.0002 max mem: 54684
+[07:57:49.855063] Epoch: [2] [1100/3229] lr: 0.000017 grad_norm: 0.5673 (0.5654) closs: 1.0575 (1.0238) time: 4.1067 data: 0.0002 max mem: 54684
+[07:58:31.249901] Epoch: [2] [1110/3229] lr: 0.000017 grad_norm: 0.5670 (0.5656) closs: 1.0418 (1.0240) time: 4.1302 data: 0.0002 max mem: 54684
+[07:59:12.171662] Epoch: [2] [1120/3229] lr: 0.000017 grad_norm: 0.5640 (0.5655) closs: 1.0369 (1.0238) time: 4.1158 data: 0.0002 max mem: 54684
+[07:59:53.140104] Epoch: [2] [1130/3229] lr: 0.000017 grad_norm: 0.5731 (0.5658) closs: 1.0386 (1.0239) time: 4.0944 data: 0.0002 max mem: 54684
+[08:00:32.932831] Epoch: [2] [1140/3229] lr: 0.000017 grad_norm: 0.6018 (0.5660) closs: 0.9926 (1.0234) time: 4.0380 data: 0.0002 max mem: 54684
+[08:01:14.046816] Epoch: [2] [1150/3229] lr: 0.000017 grad_norm: 0.5912 (0.5662) closs: 0.9690 (1.0232) time: 4.0453 data: 0.0002 max mem: 54684
+[08:01:55.095228] Epoch: [2] [1160/3229] lr: 0.000017 grad_norm: 0.6100 (0.5664) closs: 1.0358 (1.0234) time: 4.1081 data: 0.0002 max mem: 54684
+[08:02:35.605443] Epoch: [2] [1170/3229] lr: 0.000017 grad_norm: 0.6100 (0.5663) closs: 1.0243 (1.0233) time: 4.0779 data: 0.0002 max mem: 54684
+[08:03:16.565669] Epoch: [2] [1180/3229] lr: 0.000017 grad_norm: 0.5632 (0.5663) closs: 1.0243 (1.0233) time: 4.0735 data: 0.0002 max mem: 54684
+[08:03:57.204562] Epoch: [2] [1190/3229] lr: 0.000016 grad_norm: 0.5631 (0.5662) closs: 1.0358 (1.0235) time: 4.0799 data: 0.0002 max mem: 54684
+[08:04:38.643319] Epoch: [2] [1200/3229] lr: 0.000016 grad_norm: 0.5646 (0.5663) closs: 1.0465 (1.0236) time: 4.1038 data: 0.0002 max mem: 54684
+[08:05:19.875642] Epoch: [2] [1210/3229] lr: 0.000016 grad_norm: 0.5646 (0.5663) closs: 1.0653 (1.0240) time: 4.1335 data: 0.0002 max mem: 54684
+[08:06:00.968858] Epoch: [2] [1220/3229] lr: 0.000016 grad_norm: 0.5569 (0.5661) closs: 1.0412 (1.0239) time: 4.1162 data: 0.0002 max mem: 54684
+[08:06:42.285568] Epoch: [2] [1230/3229] lr: 0.000016 grad_norm: 0.5760 (0.5663) closs: 1.0333 (1.0241) time: 4.1204 data: 0.0002 max mem: 54684
+[08:07:23.000807] Epoch: [2] [1240/3229] lr: 0.000016 grad_norm: 0.5760 (0.5662) closs: 1.0333 (1.0241) time: 4.1015 data: 0.0002 max mem: 54684
+[08:08:04.264643] Epoch: [2] [1250/3229] lr: 0.000016 grad_norm: 0.5775 (0.5664) closs: 1.0675 (1.0246) time: 4.0989 data: 0.0002 max mem: 54684
+[08:08:44.969858] Epoch: [2] [1260/3229] lr: 0.000016 grad_norm: 0.5808 (0.5663) closs: 1.0700 (1.0245) time: 4.0984 data: 0.0002 max mem: 54684
+[08:09:25.973545] Epoch: [2] [1270/3229] lr: 0.000016 grad_norm: 0.5808 (0.5664) closs: 1.0291 (1.0246) time: 4.0854 data: 0.0002 max mem: 54684
+[08:10:06.727519] Epoch: [2] [1280/3229] lr: 0.000015 grad_norm: 0.5845 (0.5664) closs: 1.0281 (1.0244) time: 4.0878 data: 0.0002 max mem: 54684
+[08:10:47.690909] Epoch: [2] [1290/3229] lr: 0.000015 grad_norm: 0.5669 (0.5663) closs: 1.0443 (1.0248) time: 4.0858 data: 0.0002 max mem: 54684
+[08:11:28.781544] Epoch: [2] [1300/3229] lr: 0.000015 grad_norm: 0.5669 (0.5663) closs: 1.0462 (1.0250) time: 4.1026 data: 0.0002 max mem: 54684
+[08:12:09.710793] Epoch: [2] [1310/3229] lr: 0.000015 grad_norm: 0.5716 (0.5664) closs: 1.0190 (1.0248) time: 4.1009 data: 0.0002 max mem: 54684
+[08:12:50.835464] Epoch: [2] [1320/3229] lr: 0.000015 grad_norm: 0.5617 (0.5662) closs: 1.0010 (1.0248) time: 4.1026 data: 0.0002 max mem: 54684
+[08:13:31.399909] Epoch: [2] [1330/3229] lr: 0.000015 grad_norm: 0.5617 (0.5662) closs: 1.0131 (1.0245) time: 4.0844 data: 0.0002 max mem: 54684
+[08:14:11.778149] Epoch: [2] [1340/3229] lr: 0.000015 grad_norm: 0.5725 (0.5661) closs: 0.9745 (1.0241) time: 4.0471 data: 0.0002 max mem: 54684
+[08:14:52.478894] Epoch: [2] [1350/3229] lr: 0.000015 grad_norm: 0.5570 (0.5660) closs: 0.9745 (1.0238) time: 4.0539 data: 0.0002 max mem: 54684
+[08:15:33.588932] Epoch: [2] [1360/3229] lr: 0.000015 grad_norm: 0.5678 (0.5661) closs: 0.9983 (1.0241) time: 4.0905 data: 0.0002 max mem: 54684
+[08:16:14.787838] Epoch: [2] [1370/3229] lr: 0.000015 grad_norm: 0.5731 (0.5662) closs: 1.0447 (1.0241) time: 4.1154 data: 0.0002 max mem: 54684
+[08:16:55.897936] Epoch: [2] [1380/3229] lr: 0.000014 grad_norm: 0.5857 (0.5663) closs: 1.0447 (1.0246) time: 4.1154 data: 0.0002 max mem: 54684
+[08:17:36.835149] Epoch: [2] [1390/3229] lr: 0.000014 grad_norm: 0.5857 (0.5664) closs: 1.0514 (1.0247) time: 4.1023 data: 0.0002 max mem: 54684
+[08:18:18.203506] Epoch: [2] [1400/3229] lr: 0.000014 grad_norm: 0.5794 (0.5665) closs: 1.0346 (1.0247) time: 4.1152 data: 0.0002 max mem: 54684
+[08:18:58.505738] Epoch: [2] [1410/3229] lr: 0.000014 grad_norm: 0.5650 (0.5664) closs: 1.0340 (1.0245) time: 4.0835 data: 0.0002 max mem: 54684
+[08:19:39.586158] Epoch: [2] [1420/3229] lr: 0.000014 grad_norm: 0.5650 (0.5665) closs: 1.0154 (1.0246) time: 4.0691 data: 0.0002 max mem: 54684
+[08:20:20.294950] Epoch: [2] [1430/3229] lr: 0.000014 grad_norm: 0.5863 (0.5666) closs: 1.0423 (1.0247) time: 4.0894 data: 0.0002 max mem: 54684
+[08:21:01.589354] Epoch: [2] [1440/3229] lr: 0.000014 grad_norm: 0.5637 (0.5665) closs: 1.0237 (1.0245) time: 4.1001 data: 0.0002 max mem: 54684
+[08:21:42.141787] Epoch: [2] [1450/3229] lr: 0.000014 grad_norm: 0.5447 (0.5664) closs: 1.0227 (1.0243) time: 4.0923 data: 0.0002 max mem: 54684
+[08:22:22.256203] Epoch: [2] [1460/3229] lr: 0.000014 grad_norm: 0.5513 (0.5663) closs: 0.9757 (1.0240) time: 4.0333 data: 0.0002 max mem: 54684
+[08:23:03.179340] Epoch: [2] [1470/3229] lr: 0.000014 grad_norm: 0.5653 (0.5663) closs: 0.9796 (1.0239) time: 4.0518 data: 0.0002 max mem: 54684
+[08:23:44.284317] Epoch: [2] [1480/3229] lr: 0.000014 grad_norm: 0.5706 (0.5664) closs: 1.0076 (1.0239) time: 4.1013 data: 0.0002 max mem: 54684
+[08:24:25.519566] Epoch: [2] [1490/3229] lr: 0.000013 grad_norm: 0.5768 (0.5664) closs: 1.0317 (1.0240) time: 4.1169 data: 0.0002 max mem: 54684
+[08:25:06.727554] Epoch: [2] [1500/3229] lr: 0.000013 grad_norm: 0.5675 (0.5664) closs: 1.0317 (1.0240) time: 4.1221 data: 0.0002 max mem: 54684
+[08:25:47.335026] Epoch: [2] [1510/3229] lr: 0.000013 grad_norm: 0.5676 (0.5664) closs: 1.0193 (1.0239) time: 4.0907 data: 0.0002 max mem: 54684
+[08:26:28.009053] Epoch: [2] [1520/3229] lr: 0.000013 grad_norm: 0.5730 (0.5664) closs: 1.0532 (1.0241) time: 4.0640 data: 0.0002 max mem: 54684
+[08:27:08.283681] Epoch: [2] [1530/3229] lr: 0.000013 grad_norm: 0.5565 (0.5662) closs: 1.0299 (1.0239) time: 4.0474 data: 0.0002 max mem: 54684
+[08:27:49.232202] Epoch: [2] [1540/3229] lr: 0.000013 grad_norm: 0.5565 (0.5661) closs: 1.0269 (1.0240) time: 4.0611 data: 0.0002 max mem: 54684
+[08:28:30.229388] Epoch: [2] [1550/3229] lr: 0.000013 grad_norm: 0.5616 (0.5662) closs: 1.0446 (1.0241) time: 4.0972 data: 0.0002 max mem: 54684
+[08:29:11.247931] Epoch: [2] [1560/3229] lr: 0.000013 grad_norm: 0.5649 (0.5661) closs: 1.0224 (1.0240) time: 4.1007 data: 0.0002 max mem: 54684
+[08:29:51.889980] Epoch: [2] [1570/3229] lr: 0.000013 grad_norm: 0.5717 (0.5662) closs: 1.0066 (1.0243) time: 4.0830 data: 0.0002 max mem: 54684
+[08:30:32.727479] Epoch: [2] [1580/3229] lr: 0.000013 grad_norm: 0.5624 (0.5661) closs: 1.0446 (1.0241) time: 4.0739 data: 0.0002 max mem: 54684
+[08:31:13.949901] Epoch: [2] [1590/3229] lr: 0.000012 grad_norm: 0.5624 (0.5662) closs: 0.9921 (1.0243) time: 4.1029 data: 0.0002 max mem: 54684
+[08:31:54.201566] Epoch: [2] [1600/3229] lr: 0.000012 grad_norm: 0.5771 (0.5663) closs: 0.9877 (1.0241) time: 4.0736 data: 0.0002 max mem: 54684
+[08:32:34.735954] Epoch: [2] [1610/3229] lr: 0.000012 grad_norm: 0.5812 (0.5664) closs: 0.9910 (1.0241) time: 4.0392 data: 0.0002 max mem: 54684
+[08:33:16.428448] Epoch: [2] [1620/3229] lr: 0.000012 grad_norm: 0.5737 (0.5665) closs: 1.0327 (1.0241) time: 4.1113 data: 0.0002 max mem: 54684
+[08:33:57.468499] Epoch: [2] [1630/3229] lr: 0.000012 grad_norm: 0.5634 (0.5664) closs: 1.0435 (1.0241) time: 4.1366 data: 0.0002 max mem: 54684
+[08:34:38.562935] Epoch: [2] [1640/3229] lr: 0.000012 grad_norm: 0.5634 (0.5664) closs: 1.0311 (1.0243) time: 4.1067 data: 0.0002 max mem: 54684
+[08:35:19.378294] Epoch: [2] [1650/3229] lr: 0.000012 grad_norm: 0.5610 (0.5663) closs: 1.0311 (1.0242) time: 4.0954 data: 0.0002 max mem: 54684
+[08:36:00.271634] Epoch: [2] [1660/3229] lr: 0.000012 grad_norm: 0.5621 (0.5664) closs: 0.9766 (1.0241) time: 4.0854 data: 0.0002 max mem: 54684
+[08:36:41.474243] Epoch: [2] [1670/3229] lr: 0.000012 grad_norm: 0.5854 (0.5666) closs: 1.0585 (1.0243) time: 4.1047 data: 0.0002 max mem: 54684
+[08:37:22.515951] Epoch: [2] [1680/3229] lr: 0.000012 grad_norm: 0.5781 (0.5666) closs: 1.0171 (1.0241) time: 4.1122 data: 0.0002 max mem: 54684
+[08:38:02.968774] Epoch: [2] [1690/3229] lr: 0.000012 grad_norm: 0.5756 (0.5666) closs: 1.0070 (1.0242) time: 4.0747 data: 0.0002 max mem: 54684
+[08:38:44.713956] Epoch: [2] [1700/3229] lr: 0.000012 grad_norm: 0.5763 (0.5666) closs: 1.0158 (1.0242) time: 4.1098 data: 0.0002 max mem: 54684
+[08:39:25.060084] Epoch: [2] [1710/3229] lr: 0.000011 grad_norm: 0.5616 (0.5665) closs: 1.0406 (1.0243) time: 4.1045 data: 0.0002 max mem: 54684
+[08:40:05.596502] Epoch: [2] [1720/3229] lr: 0.000011 grad_norm: 0.5514 (0.5665) closs: 1.0451 (1.0243) time: 4.0441 data: 0.0002 max mem: 54684
+[08:40:46.724610] Epoch: [2] [1730/3229] lr: 0.000011 grad_norm: 0.5627 (0.5665) closs: 1.0642 (1.0244) time: 4.0832 data: 0.0002 max mem: 54684
+[08:41:27.461413] Epoch: [2] [1740/3229] lr: 0.000011 grad_norm: 0.5660 (0.5666) closs: 1.0424 (1.0243) time: 4.0932 data: 0.0002 max mem: 54684
+[08:42:08.157448] Epoch: [2] [1750/3229] lr: 0.000011 grad_norm: 0.5676 (0.5665) closs: 1.0295 (1.0244) time: 4.0716 data: 0.0002 max mem: 54684
+[08:42:48.966233] Epoch: [2] [1760/3229] lr: 0.000011 grad_norm: 0.5615 (0.5666) closs: 1.0657 (1.0245) time: 4.0752 data: 0.0002 max mem: 54684
+[08:43:30.179827] Epoch: [2] [1770/3229] lr: 0.000011 grad_norm: 0.5615 (0.5667) closs: 1.0207 (1.0247) time: 4.1010 data: 0.0002 max mem: 54684
+[08:44:11.623138] Epoch: [2] [1780/3229] lr: 0.000011 grad_norm: 0.5781 (0.5668) closs: 1.0357 (1.0247) time: 4.1328 data: 0.0002 max mem: 54684
+[08:44:52.356211] Epoch: [2] [1790/3229] lr: 0.000011 grad_norm: 0.5796 (0.5668) closs: 1.0357 (1.0246) time: 4.1088 data: 0.0002 max mem: 54684
+[08:45:33.510164] Epoch: [2] [1800/3229] lr: 0.000011 grad_norm: 0.5773 (0.5670) closs: 1.0281 (1.0247) time: 4.0943 data: 0.0002 max mem: 54684
+[08:46:14.313566] Epoch: [2] [1810/3229] lr: 0.000011 grad_norm: 0.5773 (0.5670) closs: 1.0281 (1.0246) time: 4.0978 data: 0.0002 max mem: 54684
+[08:46:55.460563] Epoch: [2] [1820/3229] lr: 0.000011 grad_norm: 0.5854 (0.5671) closs: 1.0385 (1.0247) time: 4.0975 data: 0.0002 max mem: 54684
+[08:47:36.509580] Epoch: [2] [1830/3229] lr: 0.000011 grad_norm: 0.5626 (0.5671) closs: 1.0241 (1.0246) time: 4.1097 data: 0.0002 max mem: 54684
+[08:48:17.677647] Epoch: [2] [1840/3229] lr: 0.000010 grad_norm: 0.5677 (0.5672) closs: 1.0266 (1.0247) time: 4.1108 data: 0.0002 max mem: 54684
+[08:48:57.971461] Epoch: [2] [1850/3229] lr: 0.000010 grad_norm: 0.5741 (0.5672) closs: 1.0266 (1.0246) time: 4.0730 data: 0.0002 max mem: 54684
+[08:49:38.910848] Epoch: [2] [1860/3229] lr: 0.000010 grad_norm: 0.5630 (0.5673) closs: 1.0133 (1.0246) time: 4.0616 data: 0.0002 max mem: 54684
+[08:50:19.579669] Epoch: [2] [1870/3229] lr: 0.000010 grad_norm: 0.5608 (0.5674) closs: 1.0030 (1.0243) time: 4.0803 data: 0.0002 max mem: 54684
+[08:51:00.472658] Epoch: [2] [1880/3229] lr: 0.000010 grad_norm: 0.5605 (0.5673) closs: 1.0132 (1.0243) time: 4.0780 data: 0.0002 max mem: 54684
+[08:51:41.817338] Epoch: [2] [1890/3229] lr: 0.000010 grad_norm: 0.5564 (0.5675) closs: 1.0360 (1.0245) time: 4.1118 data: 0.0002 max mem: 54684
+[08:52:22.696669] Epoch: [2] [1900/3229] lr: 0.000010 grad_norm: 0.5788 (0.5675) closs: 1.0678 (1.0246) time: 4.1111 data: 0.0002 max mem: 54684
+[08:53:04.048673] Epoch: [2] [1910/3229] lr: 0.000010 grad_norm: 0.5831 (0.5676) closs: 1.0454 (1.0247) time: 4.1115 data: 0.0002 max mem: 54684
+[08:53:44.004385] Epoch: [2] [1920/3229] lr: 0.000010 grad_norm: 0.5480 (0.5675) closs: 1.0063 (1.0245) time: 4.0653 data: 0.0002 max mem: 54684
+[08:54:24.721597] Epoch: [2] [1930/3229] lr: 0.000010 grad_norm: 0.5612 (0.5676) closs: 0.9927 (1.0243) time: 4.0336 data: 0.0002 max mem: 54684
+[08:55:04.941963] Epoch: [2] [1940/3229] lr: 0.000010 grad_norm: 0.5813 (0.5675) closs: 1.0320 (1.0241) time: 4.0468 data: 0.0002 max mem: 54684
+[08:55:45.992823] Epoch: [2] [1950/3229] lr: 0.000010 grad_norm: 0.5605 (0.5675) closs: 1.0352 (1.0240) time: 4.0635 data: 0.0002 max mem: 54684
+[08:56:26.486944] Epoch: [2] [1960/3229] lr: 0.000010 grad_norm: 0.5619 (0.5674) closs: 1.0194 (1.0241) time: 4.0772 data: 0.0002 max mem: 54684
+[08:57:07.906961] Epoch: [2] [1970/3229] lr: 0.000009 grad_norm: 0.5508 (0.5674) closs: 1.0129 (1.0241) time: 4.0956 data: 0.0002 max mem: 54684
+[08:57:49.115637] Epoch: [2] [1980/3229] lr: 0.000009 grad_norm: 0.5616 (0.5675) closs: 1.0654 (1.0244) time: 4.1314 data: 0.0002 max mem: 54684
+[08:58:30.163889] Epoch: [2] [1990/3229] lr: 0.000009 grad_norm: 0.5789 (0.5676) closs: 1.0512 (1.0244) time: 4.1128 data: 0.0002 max mem: 54684
+[08:59:11.320270] Epoch: [2] [2000/3229] lr: 0.000009 grad_norm: 0.5798 (0.5678) closs: 1.0228 (1.0245) time: 4.1102 data: 0.0002 max mem: 54684
+[08:59:52.707044] Epoch: [2] [2010/3229] lr: 0.000009 grad_norm: 0.5890 (0.5679) closs: 1.0398 (1.0246) time: 4.1271 data: 0.0002 max mem: 54684
+[09:00:34.019560] Epoch: [2] [2020/3229] lr: 0.000009 grad_norm: 0.5755 (0.5679) closs: 1.0402 (1.0248) time: 4.1349 data: 0.0002 max mem: 54684
+[09:01:14.914969] Epoch: [2] [2030/3229] lr: 0.000009 grad_norm: 0.5881 (0.5682) closs: 1.0402 (1.0248) time: 4.1103 data: 0.0002 max mem: 54684
+[09:01:56.056629] Epoch: [2] [2040/3229] lr: 0.000009 grad_norm: 0.5943 (0.5683) closs: 1.0367 (1.0249) time: 4.1018 data: 0.0002 max mem: 54684
+[09:02:36.707902] Epoch: [2] [2050/3229] lr: 0.000009 grad_norm: 0.5845 (0.5684) closs: 1.0345 (1.0248) time: 4.0896 data: 0.0002 max mem: 54684
+[09:03:17.997205] Epoch: [2] [2060/3229] lr: 0.000009 grad_norm: 0.5942 (0.5686) closs: 1.0345 (1.0249) time: 4.0970 data: 0.0002 max mem: 54684
+[09:03:58.576728] Epoch: [2] [2070/3229] lr: 0.000009 grad_norm: 0.5693 (0.5684) closs: 0.9970 (1.0246) time: 4.0934 data: 0.0002 max mem: 54684
+[09:04:39.774170] Epoch: [2] [2080/3229] lr: 0.000009 grad_norm: 0.5613 (0.5685) closs: 1.0260 (1.0248) time: 4.0888 data: 0.0002 max mem: 54684
+[09:05:20.813724] Epoch: [2] [2090/3229] lr: 0.000009 grad_norm: 0.5638 (0.5684) closs: 1.0313 (1.0248) time: 4.1118 data: 0.0002 max mem: 54684
+[09:06:01.815018] Epoch: [2] [2100/3229] lr: 0.000009 grad_norm: 0.5638 (0.5684) closs: 1.0313 (1.0249) time: 4.1020 data: 0.0002 max mem: 54684
+[09:06:41.722289] Epoch: [2] [2110/3229] lr: 0.000009 grad_norm: 0.5757 (0.5684) closs: 1.0256 (1.0247) time: 4.0454 data: 0.0002 max mem: 54684
+[09:07:22.224103] Epoch: [2] [2120/3229] lr: 0.000008 grad_norm: 0.5549 (0.5683) closs: 1.0256 (1.0247) time: 4.0204 data: 0.0002 max mem: 54684
+[09:08:02.949927] Epoch: [2] [2130/3229] lr: 0.000008 grad_norm: 0.5599 (0.5683) closs: 1.0180 (1.0246) time: 4.0613 data: 0.0002 max mem: 54684
+[09:08:43.598145] Epoch: [2] [2140/3229] lr: 0.000008 grad_norm: 0.5644 (0.5682) closs: 1.0041 (1.0246) time: 4.0686 data: 0.0002 max mem: 54684
+[09:09:24.526465] Epoch: [2] [2150/3229] lr: 0.000008 grad_norm: 0.5686 (0.5683) closs: 1.0608 (1.0248) time: 4.0788 data: 0.0002 max mem: 54684
+[09:10:05.683146] Epoch: [2] [2160/3229] lr: 0.000008 grad_norm: 0.5769 (0.5684) closs: 1.0634 (1.0250) time: 4.1042 data: 0.0002 max mem: 54684
+[09:10:46.714264] Epoch: [2] [2170/3229] lr: 0.000008 grad_norm: 0.5744 (0.5684) closs: 1.0447 (1.0250) time: 4.1093 data: 0.0002 max mem: 54684
+[09:11:27.676475] Epoch: [2] [2180/3229] lr: 0.000008 grad_norm: 0.5801 (0.5686) closs: 1.0257 (1.0251) time: 4.0996 data: 0.0002 max mem: 54684
+[09:12:08.191970] Epoch: [2] [2190/3229] lr: 0.000008 grad_norm: 0.5781 (0.5687) closs: 1.0314 (1.0250) time: 4.0738 data: 0.0002 max mem: 54684
+[09:12:48.089750] Epoch: [2] [2200/3229] lr: 0.000008 grad_norm: 0.5526 (0.5685) closs: 0.9832 (1.0247) time: 4.0206 data: 0.0002 max mem: 54684
+[09:13:29.107901] Epoch: [2] [2210/3229] lr: 0.000008 grad_norm: 0.5469 (0.5685) closs: 0.9921 (1.0247) time: 4.0457 data: 0.0002 max mem: 54684
+[09:14:10.541589] Epoch: [2] [2220/3229] lr: 0.000008 grad_norm: 0.5703 (0.5686) closs: 1.0479 (1.0249) time: 4.1225 data: 0.0002 max mem: 54684
+[09:14:51.639545] Epoch: [2] [2230/3229] lr: 0.000008 grad_norm: 0.5834 (0.5687) closs: 1.0858 (1.0251) time: 4.1265 data: 0.0002 max mem: 54684
+[09:15:32.541681] Epoch: [2] [2240/3229] lr: 0.000008 grad_norm: 0.5796 (0.5688) closs: 1.0672 (1.0252) time: 4.0999 data: 0.0002 max mem: 54684
+[09:16:13.933486] Epoch: [2] [2250/3229] lr: 0.000008 grad_norm: 0.5624 (0.5688) closs: 1.0850 (1.0254) time: 4.1146 data: 0.0002 max mem: 54684
+[09:16:55.107667] Epoch: [2] [2260/3229] lr: 0.000008 grad_norm: 0.5624 (0.5688) closs: 1.0799 (1.0254) time: 4.1282 data: 0.0002 max mem: 54684
+[09:17:36.230920] Epoch: [2] [2270/3229] lr: 0.000008 grad_norm: 0.5839 (0.5690) closs: 1.0627 (1.0256) time: 4.1148 data: 0.0002 max mem: 54684
+[09:18:16.446570] Epoch: [2] [2280/3229] lr: 0.000008 grad_norm: 0.5705 (0.5688) closs: 1.0307 (1.0255) time: 4.0669 data: 0.0002 max mem: 54684
+[09:18:57.941452] Epoch: [2] [2290/3229] lr: 0.000008 grad_norm: 0.5705 (0.5689) closs: 1.0202 (1.0255) time: 4.0855 data: 0.0002 max mem: 54684
+[09:19:38.974231] Epoch: [2] [2300/3229] lr: 0.000007 grad_norm: 0.5739 (0.5689) closs: 1.0446 (1.0256) time: 4.1263 data: 0.0002 max mem: 54684
+[09:20:19.518699] Epoch: [2] [2310/3229] lr: 0.000007 grad_norm: 0.5646 (0.5688) closs: 1.0285 (1.0256) time: 4.0788 data: 0.0002 max mem: 54684
+[09:21:00.784491] Epoch: [2] [2320/3229] lr: 0.000007 grad_norm: 0.5741 (0.5690) closs: 1.0014 (1.0254) time: 4.0904 data: 0.0002 max mem: 54684
+[09:21:41.070228] Epoch: [2] [2330/3229] lr: 0.000007 grad_norm: 0.5626 (0.5689) closs: 0.9974 (1.0253) time: 4.0775 data: 0.0002 max mem: 54684
+[09:22:22.069016] Epoch: [2] [2340/3229] lr: 0.000007 grad_norm: 0.5483 (0.5690) closs: 1.0038 (1.0253) time: 4.0642 data: 0.0002 max mem: 54684
+[09:23:02.881018] Epoch: [2] [2350/3229] lr: 0.000007 grad_norm: 0.5861 (0.5690) closs: 1.0259 (1.0253) time: 4.0905 data: 0.0002 max mem: 54684
+[09:23:42.648379] Epoch: [2] [2360/3229] lr: 0.000007 grad_norm: 0.5786 (0.5688) closs: 0.9870 (1.0251) time: 4.0289 data: 0.0002 max mem: 54684
+[09:24:23.453167] Epoch: [2] [2370/3229] lr: 0.000007 grad_norm: 0.5539 (0.5689) closs: 0.9815 (1.0251) time: 4.0285 data: 0.0002 max mem: 54684
+[09:25:04.136256] Epoch: [2] [2380/3229] lr: 0.000007 grad_norm: 0.5700 (0.5688) closs: 0.9940 (1.0251) time: 4.0743 data: 0.0002 max mem: 54684
+[09:25:45.337800] Epoch: [2] [2390/3229] lr: 0.000007 grad_norm: 0.5788 (0.5690) closs: 1.0564 (1.0252) time: 4.0942 data: 0.0002 max mem: 54684
+[09:26:25.739172] Epoch: [2] [2400/3229] lr: 0.000007 grad_norm: 0.5754 (0.5689) closs: 1.0595 (1.0252) time: 4.0801 data: 0.0002 max mem: 54684
+[09:27:05.964659] Epoch: [2] [2410/3229] lr: 0.000007 grad_norm: 0.5509 (0.5688) closs: 1.0436 (1.0250) time: 4.0313 data: 0.0002 max mem: 54684
+[09:27:46.656831] Epoch: [2] [2420/3229] lr: 0.000007 grad_norm: 0.5834 (0.5689) closs: 1.0413 (1.0250) time: 4.0458 data: 0.0002 max mem: 54684
+[09:28:27.493028] Epoch: [2] [2430/3229] lr: 0.000007 grad_norm: 0.5834 (0.5689) closs: 1.0413 (1.0250) time: 4.0764 data: 0.0002 max mem: 54684
+[09:29:08.900562] Epoch: [2] [2440/3229] lr: 0.000007 grad_norm: 0.5791 (0.5690) closs: 1.0370 (1.0250) time: 4.1121 data: 0.0002 max mem: 54684
+[09:29:49.772338] Epoch: [2] [2450/3229] lr: 0.000007 grad_norm: 0.5795 (0.5690) closs: 1.0208 (1.0250) time: 4.1139 data: 0.0002 max mem: 54684
+[09:30:31.137605] Epoch: [2] [2460/3229] lr: 0.000007 grad_norm: 0.5857 (0.5691) closs: 1.0189 (1.0250) time: 4.1118 data: 0.0002 max mem: 54684
+[09:31:12.282940] Epoch: [2] [2470/3229] lr: 0.000007 grad_norm: 0.5806 (0.5691) closs: 1.0523 (1.0251) time: 4.1255 data: 0.0002 max mem: 54684
+[09:31:53.328571] Epoch: [2] [2480/3229] lr: 0.000007 grad_norm: 0.5814 (0.5691) closs: 1.0437 (1.0251) time: 4.1095 data: 0.0002 max mem: 54684
+[09:32:34.487775] Epoch: [2] [2490/3229] lr: 0.000007 grad_norm: 0.5810 (0.5692) closs: 1.0437 (1.0253) time: 4.1102 data: 0.0002 max mem: 54684
+[09:33:14.819132] Epoch: [2] [2500/3229] lr: 0.000007 grad_norm: 0.5729 (0.5691) closs: 1.0522 (1.0252) time: 4.0745 data: 0.0002 max mem: 54684
+[09:33:55.981325] Epoch: [2] [2510/3229] lr: 0.000006 grad_norm: 0.5781 (0.5692) closs: 1.0177 (1.0253) time: 4.0746 data: 0.0002 max mem: 54684
+[09:34:36.645545] Epoch: [2] [2520/3229] lr: 0.000006 grad_norm: 0.5797 (0.5692) closs: 1.0270 (1.0253) time: 4.0913 data: 0.0002 max mem: 54684
+[09:35:17.317073] Epoch: [2] [2530/3229] lr: 0.000006 grad_norm: 0.5780 (0.5692) closs: 0.9929 (1.0253) time: 4.0667 data: 0.0002 max mem: 54684
+[09:35:58.540016] Epoch: [2] [2540/3229] lr: 0.000006 grad_norm: 0.5648 (0.5693) closs: 0.9905 (1.0252) time: 4.0947 data: 0.0002 max mem: 54684
+[09:36:39.741723] Epoch: [2] [2550/3229] lr: 0.000006 grad_norm: 0.6054 (0.5695) closs: 0.9905 (1.0251) time: 4.1212 data: 0.0002 max mem: 54684
+[09:37:21.111863] Epoch: [2] [2560/3229] lr: 0.000006 grad_norm: 0.5947 (0.5695) closs: 1.0207 (1.0253) time: 4.1285 data: 0.0002 max mem: 54684
+[09:38:02.411083] Epoch: [2] [2570/3229] lr: 0.000006 grad_norm: 0.6009 (0.5698) closs: 1.0260 (1.0253) time: 4.1334 data: 0.0002 max mem: 54684
+[09:38:43.003989] Epoch: [2] [2580/3229] lr: 0.000006 grad_norm: 0.6009 (0.5698) closs: 1.0059 (1.0252) time: 4.0945 data: 0.0002 max mem: 54684
+[09:39:23.209991] Epoch: [2] [2590/3229] lr: 0.000006 grad_norm: 0.5756 (0.5697) closs: 0.9988 (1.0251) time: 4.0399 data: 0.0002 max mem: 54684
+[09:40:03.959666] Epoch: [2] [2600/3229] lr: 0.000006 grad_norm: 0.5520 (0.5696) closs: 1.0161 (1.0251) time: 4.0477 data: 0.0002 max mem: 54684
+[09:40:45.247321] Epoch: [2] [2610/3229] lr: 0.000006 grad_norm: 0.5601 (0.5697) closs: 1.0427 (1.0251) time: 4.1018 data: 0.0002 max mem: 54684
+[09:41:26.150398] Epoch: [2] [2620/3229] lr: 0.000006 grad_norm: 0.5791 (0.5698) closs: 1.0075 (1.0251) time: 4.1095 data: 0.0002 max mem: 54684
+[09:42:06.649226] Epoch: [2] [2630/3229] lr: 0.000006 grad_norm: 0.5703 (0.5697) closs: 1.0049 (1.0249) time: 4.0700 data: 0.0002 max mem: 54684
+[09:42:48.023850] Epoch: [2] [2640/3229] lr: 0.000006 grad_norm: 0.5535 (0.5697) closs: 1.0153 (1.0251) time: 4.0936 data: 0.0002 max mem: 54684
+[09:43:29.335232] Epoch: [2] [2650/3229] lr: 0.000006 grad_norm: 0.5492 (0.5698) closs: 1.0620 (1.0252) time: 4.1342 data: 0.0002 max mem: 54684
+[09:44:10.557760] Epoch: [2] [2660/3229] lr: 0.000006 grad_norm: 0.5899 (0.5698) closs: 1.0605 (1.0253) time: 4.1266 data: 0.0002 max mem: 54684
+[09:44:51.395990] Epoch: [2] [2670/3229] lr: 0.000006 grad_norm: 0.5899 (0.5699) closs: 1.0605 (1.0254) time: 4.1030 data: 0.0002 max mem: 54684
+[09:45:32.073560] Epoch: [2] [2680/3229] lr: 0.000006 grad_norm: 0.5913 (0.5699) closs: 1.0339 (1.0254) time: 4.0757 data: 0.0002 max mem: 54684
+[09:46:12.446721] Epoch: [2] [2690/3229] lr: 0.000006 grad_norm: 0.5587 (0.5698) closs: 1.0229 (1.0253) time: 4.0525 data: 0.0002 max mem: 54684
+[09:46:53.342194] Epoch: [2] [2700/3229] lr: 0.000006 grad_norm: 0.5619 (0.5699) closs: 1.0266 (1.0254) time: 4.0634 data: 0.0002 max mem: 54684
+[09:47:34.230607] Epoch: [2] [2710/3229] lr: 0.000006 grad_norm: 0.5640 (0.5699) closs: 1.0674 (1.0255) time: 4.0891 data: 0.0002 max mem: 54684
+[09:48:14.935975] Epoch: [2] [2720/3229] lr: 0.000006 grad_norm: 0.5622 (0.5698) closs: 1.0250 (1.0254) time: 4.0796 data: 0.0002 max mem: 54684
+[09:48:56.405424] Epoch: [2] [2730/3229] lr: 0.000006 grad_norm: 0.5622 (0.5699) closs: 1.0256 (1.0255) time: 4.1087 data: 0.0002 max mem: 54684
+[09:49:37.167283] Epoch: [2] [2740/3229] lr: 0.000006 grad_norm: 0.5708 (0.5699) closs: 1.0323 (1.0255) time: 4.1115 data: 0.0002 max mem: 54684
+[09:50:18.316921] Epoch: [2] [2750/3229] lr: 0.000006 grad_norm: 0.5862 (0.5700) closs: 1.0592 (1.0257) time: 4.0955 data: 0.0002 max mem: 54684
+[09:50:59.109573] Epoch: [2] [2760/3229] lr: 0.000006 grad_norm: 0.5722 (0.5700) closs: 1.0711 (1.0257) time: 4.0970 data: 0.0002 max mem: 54684
+[09:51:40.256779] Epoch: [2] [2770/3229] lr: 0.000006 grad_norm: 0.5834 (0.5701) closs: 1.0126 (1.0257) time: 4.0969 data: 0.0002 max mem: 54684
+[09:52:21.033361] Epoch: [2] [2780/3229] lr: 0.000006 grad_norm: 0.5853 (0.5701) closs: 1.0494 (1.0258) time: 4.0961 data: 0.0002 max mem: 54684
+[09:53:01.853593] Epoch: [2] [2790/3229] lr: 0.000006 grad_norm: 0.5853 (0.5701) closs: 1.0702 (1.0259) time: 4.0798 data: 0.0002 max mem: 54684
+[09:53:42.906057] Epoch: [2] [2800/3229] lr: 0.000006 grad_norm: 0.5772 (0.5701) closs: 1.0605 (1.0260) time: 4.0936 data: 0.0002 max mem: 54684
+[09:54:24.030976] Epoch: [2] [2810/3229] lr: 0.000006 grad_norm: 0.5667 (0.5702) closs: 1.0430 (1.0260) time: 4.1088 data: 0.0002 max mem: 54684
+[09:55:04.890562] Epoch: [2] [2820/3229] lr: 0.000005 grad_norm: 0.5547 (0.5701) closs: 1.0131 (1.0259) time: 4.0992 data: 0.0002 max mem: 54684
+[09:55:45.343232] Epoch: [2] [2830/3229] lr: 0.000005 grad_norm: 0.5592 (0.5701) closs: 1.0131 (1.0258) time: 4.0655 data: 0.0002 max mem: 54684
+[09:56:26.023640] Epoch: [2] [2840/3229] lr: 0.000005 grad_norm: 0.5666 (0.5700) closs: 0.9882 (1.0257) time: 4.0566 data: 0.0002 max mem: 54684
+[09:57:07.375630] Epoch: [2] [2850/3229] lr: 0.000005 grad_norm: 0.5781 (0.5701) closs: 1.0202 (1.0258) time: 4.1016 data: 0.0002 max mem: 54684
+[09:57:47.932124] Epoch: [2] [2860/3229] lr: 0.000005 grad_norm: 0.5781 (0.5701) closs: 1.0452 (1.0258) time: 4.0954 data: 0.0002 max mem: 54684
+[09:58:28.805268] Epoch: [2] [2870/3229] lr: 0.000005 grad_norm: 0.5698 (0.5702) closs: 1.0353 (1.0257) time: 4.0714 data: 0.0002 max mem: 54684
+[09:59:10.101929] Epoch: [2] [2880/3229] lr: 0.000005 grad_norm: 0.5847 (0.5703) closs: 1.0293 (1.0259) time: 4.1084 data: 0.0002 max mem: 54684
+[09:59:51.257363] Epoch: [2] [2890/3229] lr: 0.000005 grad_norm: 0.5847 (0.5703) closs: 1.0182 (1.0256) time: 4.1225 data: 0.0002 max mem: 54684
+[10:00:32.076254] Epoch: [2] [2900/3229] lr: 0.000005 grad_norm: 0.5885 (0.5703) closs: 1.0088 (1.0256) time: 4.0987 data: 0.0002 max mem: 54684
+[10:01:13.043669] Epoch: [2] [2910/3229] lr: 0.000005 grad_norm: 0.5893 (0.5704) closs: 1.0251 (1.0256) time: 4.0892 data: 0.0002 max mem: 54684
+[10:01:54.261319] Epoch: [2] [2920/3229] lr: 0.000005 grad_norm: 0.5820 (0.5704) closs: 1.0368 (1.0257) time: 4.1092 data: 0.0002 max mem: 54684
+[10:02:35.084970] Epoch: [2] [2930/3229] lr: 0.000005 grad_norm: 0.5802 (0.5704) closs: 1.0487 (1.0256) time: 4.1020 data: 0.0002 max mem: 54684
+[10:03:16.214095] Epoch: [2] [2940/3229] lr: 0.000005 grad_norm: 0.5631 (0.5704) closs: 1.0370 (1.0256) time: 4.0976 data: 0.0002 max mem: 54684
+[10:03:56.615821] Epoch: [2] [2950/3229] lr: 0.000005 grad_norm: 0.5883 (0.5704) closs: 1.0335 (1.0255) time: 4.0765 data: 0.0002 max mem: 54684
+[10:04:36.729356] Epoch: [2] [2960/3229] lr: 0.000005 grad_norm: 0.5883 (0.5704) closs: 0.9811 (1.0254) time: 4.0257 data: 0.0002 max mem: 54684
+[10:05:17.405065] Epoch: [2] [2970/3229] lr: 0.000005 grad_norm: 0.6052 (0.5705) closs: 1.0261 (1.0253) time: 4.0394 data: 0.0002 max mem: 54684
+[10:05:58.588228] Epoch: [2] [2980/3229] lr: 0.000005 grad_norm: 0.5752 (0.5705) closs: 1.0282 (1.0253) time: 4.0929 data: 0.0002 max mem: 54684
+[10:06:39.618475] Epoch: [2] [2990/3229] lr: 0.000005 grad_norm: 0.5626 (0.5706) closs: 1.0620 (1.0255) time: 4.1106 data: 0.0002 max mem: 54684
+[10:07:20.501679] Epoch: [2] [3000/3229] lr: 0.000005 grad_norm: 0.5966 (0.5707) closs: 1.0762 (1.0255) time: 4.0956 data: 0.0002 max mem: 54684
+[10:08:00.845965] Epoch: [2] [3010/3229] lr: 0.000005 grad_norm: 0.6088 (0.5708) closs: 0.9854 (1.0254) time: 4.0613 data: 0.0002 max mem: 54684
+[10:08:41.987050] Epoch: [2] [3020/3229] lr: 0.000005 grad_norm: 0.5670 (0.5708) closs: 1.0015 (1.0254) time: 4.0742 data: 0.0002 max mem: 54684
+[10:09:23.349997] Epoch: [2] [3030/3229] lr: 0.000005 grad_norm: 0.5670 (0.5708) closs: 1.0505 (1.0256) time: 4.1251 data: 0.0002 max mem: 54684
+[10:10:04.141407] Epoch: [2] [3040/3229] lr: 0.000005 grad_norm: 0.5687 (0.5708) closs: 1.0496 (1.0255) time: 4.1076 data: 0.0002 max mem: 54684
+[10:10:45.555383] Epoch: [2] [3050/3229] lr: 0.000005 grad_norm: 0.5653 (0.5708) closs: 1.0073 (1.0255) time: 4.1102 data: 0.0002 max mem: 54684
+[10:11:26.441571] Epoch: [2] [3060/3229] lr: 0.000005 grad_norm: 0.5727 (0.5708) closs: 1.0073 (1.0255) time: 4.1149 data: 0.0002 max mem: 54684
+[10:12:07.543100] Epoch: [2] [3070/3229] lr: 0.000005 grad_norm: 0.5985 (0.5709) closs: 1.0305 (1.0254) time: 4.0993 data: 0.0002 max mem: 54684
+[10:12:48.448900] Epoch: [2] [3080/3229] lr: 0.000005 grad_norm: 0.5755 (0.5709) closs: 1.0231 (1.0253) time: 4.1003 data: 0.0002 max mem: 54684
+[10:13:29.389636] Epoch: [2] [3090/3229] lr: 0.000005 grad_norm: 0.5588 (0.5708) closs: 1.0231 (1.0254) time: 4.0923 data: 0.0002 max mem: 54684
+[10:14:10.537731] Epoch: [2] [3100/3229] lr: 0.000005 grad_norm: 0.5669 (0.5709) closs: 1.0588 (1.0256) time: 4.1044 data: 0.0002 max mem: 54684
+[10:14:50.940345] Epoch: [2] [3110/3229] lr: 0.000005 grad_norm: 0.5674 (0.5708) closs: 1.0216 (1.0253) time: 4.0775 data: 0.0002 max mem: 54684
+[10:15:31.303846] Epoch: [2] [3120/3229] lr: 0.000005 grad_norm: 0.5456 (0.5708) closs: 1.0082 (1.0254) time: 4.0382 data: 0.0002 max mem: 54684
+[10:16:11.884304] Epoch: [2] [3130/3229] lr: 0.000005 grad_norm: 0.5562 (0.5707) closs: 1.0350 (1.0254) time: 4.0471 data: 0.0002 max mem: 54684
+[10:16:52.052398] Epoch: [2] [3140/3229] lr: 0.000005 grad_norm: 0.5667 (0.5708) closs: 1.0199 (1.0254) time: 4.0374 data: 0.0002 max mem: 54684
+[10:17:32.075073] Epoch: [2] [3150/3229] lr: 0.000005 grad_norm: 0.5711 (0.5707) closs: 0.9967 (1.0252) time: 4.0095 data: 0.0002 max mem: 54684
+[10:18:13.395303] Epoch: [2] [3160/3229] lr: 0.000005 grad_norm: 0.5789 (0.5708) closs: 0.9967 (1.0252) time: 4.0671 data: 0.0002 max mem: 54684
+[10:18:53.970480] Epoch: [2] [3170/3229] lr: 0.000005 grad_norm: 0.5914 (0.5708) closs: 1.0415 (1.0252) time: 4.0947 data: 0.0002 max mem: 54684
+[10:19:34.476943] Epoch: [2] [3180/3229] lr: 0.000005 grad_norm: 0.5590 (0.5708) closs: 1.0227 (1.0252) time: 4.0540 data: 0.0002 max mem: 54684
+[10:20:15.519690] Epoch: [2] [3190/3229] lr: 0.000005 grad_norm: 0.5594 (0.5708) closs: 1.0227 (1.0252) time: 4.0774 data: 0.0002 max mem: 54684
+[10:20:56.962083] Epoch: [2] [3200/3229] lr: 0.000005 grad_norm: 0.5860 (0.5709) closs: 1.0387 (1.0253) time: 4.1242 data: 0.0003 max mem: 54684
+[10:21:38.056015] Epoch: [2] [3210/3229] lr: 0.000005 grad_norm: 0.5810 (0.5709) closs: 1.0467 (1.0253) time: 4.1268 data: 0.0003 max mem: 54684
+[10:22:17.979929] Epoch: [2] [3220/3229] lr: 0.000005 grad_norm: 0.5591 (0.5708) closs: 1.0067 (1.0252) time: 4.0508 data: 0.0001 max mem: 54684
+[10:22:51.264502] Epoch: [2] Total time: 3:40:11
+[10:22:51.265330] Averaged stats: lr: 0.000005 grad_norm: 0.5545 (0.5708) closs: 1.0458 (1.0262)
+[10:22:51.602101] model saved
+[10:22:53.305325] optimizer saved
+[10:22:53.305942] other rank-common saved
+[10:22:53.310857] rank-specific saved
+[10:22:53.311056] Training time 11:00:25
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..03e77d3bd182abd05ab6f0fd2a283a162c4594b5
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:181504002e3aec111222c7f2723fd0a02e5e6a7ca993bed3f9e9865af099805f
+size 16308187
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c074aaea78df67930a757bed1ac46bd14df1b9ae
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e36be2dfbf359d72c8ef1b12a22db7ec9eea34b085454f2954627fe1939cbf9
+size 64801559
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..af3ce00d3918fe048fb7c07c06084e088400937a
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8a94ea9a9775bb26567f9bc0b23b6fc6eff533e7066e9cf9fffaffe57dfcb50
+size 1687
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch0/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3c9de3e9f830a4913b5b2d3b353aa6ef2813d55a
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8451335aca82666ba5a8dd00764f2ef1833aa30b651e6ef21cdd614212bc591e
+size 16308187
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f380fac20d197f4624a29fea92546a2dda3d927e
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af6d1e9169a8f3e6c8c001af2b09323ffc104dc524b8f87546fc6be6ef821d45
+size 64801559
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8365d872a70222c75d3639298e540ecb1eec9c2d
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:466fbbf63358a7592a770b56b92fff0f6a28886e08660f4ede3ce415b5e3fb5f
+size 1687
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch1/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..76e843f95a86694a5d1bca9b534b8abb65855f46 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f675196dd322db9f6f0f842b28f9297c783f93df2247b81f6a84b42ccc38b5e1 +size 16308187 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a22f347c1a5a3eae4c227602ecc5977460c4f5d --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c301150e4d5293b42921bc09a9cc4de794ed92d85d2c0c106051830db56a3062 +size 64801559 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..df48808eee205ff49c79c1af8905d3c4e38711c7 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809ed242fb2f8056607b961f849a4262895f1dfaa4a8cad3baa217d8c85aa32e +size 1687 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/log.txt b/finetune/sg/platypus_normBias_QF_70B_2048/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac4a38aa99f4d29a148628c8255e12c1eb7c1137 --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/log.txt @@ -0,0 +1,3 @@ +{"train_lr": 1.8013243050626546e-05, "train_closs": 0.7013087929557149, "train_grad_norm": 3.5305390616155834, "epoch": 0, "val_lr": 1.8013243050626546e-05, "val_closs": 0.7013087929557149, "val_grad_norm": 3.5305390616155834} +{"train_lr": 1.020893468942151e-05, "train_closs": 0.5819548111255799, "train_grad_norm": 3.314220068923612, "epoch": 1, "val_lr": 1.020893468942151e-05, "val_closs": 0.5819548111255799, "val_grad_norm": 3.314220068923612} +{"train_lr": 
1.7777786439025823e-06, "train_closs": 0.5618916607444961, "train_grad_norm": 3.116951620287638, "epoch": 2, "val_lr": 1.7777786439025823e-06, "val_closs": 0.5618916607444961, "val_grad_norm": 3.116951620287638} diff --git a/finetune/sg/platypus_normBias_QF_70B_2048/output.log b/finetune/sg/platypus_normBias_QF_70B_2048/output.log new file mode 100644 index 0000000000000000000000000000000000000000..df462f599efb134fff1dd01242ae260645838c6b --- /dev/null +++ b/finetune/sg/platypus_normBias_QF_70B_2048/output.log @@ -0,0 +1,2475 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 0): env://, gpu 0 +[01:37:48.827063] > initializing model parallel with size 1 +[01:37:48.827267] > initializing ddp with size 8 +[01:37:48.827274] > initializing pipeline with size 1 +[01:37:48.977074] job dir: /data/liuyijiang/mmlab/LLaMA2-Accessory/accessory +[01:37:48.977226] Namespace(batch_size=1, +accum_iter=4, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-70b/', +pretrained_type='meta_ori', +weight_decay=0.0, +lr=2e-05, +min_lr=0.0, +epochs=3, +warmup_epochs=0.04, +clip_grad=2, +max_words=2048, +dialog=False, +data_config='configs/data/finetune/sg/platypus.yaml', +output_dir='output/finetune/sg/platypus_normBias_QF_70B_2048', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=8, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[01:37:48.979334] Start initialization. +[01:37:48.979397] ## Processing on RANK 0. +[01:37:48.990463] Model Args: + ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=2048, rope_scaling=None, lora_rank=-1, bias_tuning=True) +[01:45:45.802102] Model is Peft: True +[01:45:45.809931] Trainable parameter count : 8036352 (local rank), 8036352 (all). 
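The trainable-parameter count reported above is exactly what norm-and-bias tuning predicts for this 70B configuration (dim=8192, FFN hidden dim 28672, grouped-query key/value dim 1024, 80 layers). As a quick sanity check in Python (the per-layer breakdown is read off the Param dump that follows; treating the final llma.norm.weight as trainable is an assumption, though the arithmetic supports it):

# Sanity check of "Trainable parameter count : 8036352" logged above.
# model_parallel_size=1, so the local sizes printed below equal global sizes.
dim, ffn_dim, kv_dim, n_layers = 8192, 28672, 1024, 80
per_layer = (
    dim        # attention.wq.bias
    + kv_dim   # attention.wk.bias
    + kv_dim   # attention.wv.bias
    + dim      # attention.wo.bias
    + ffn_dim  # feed_forward.w1.bias
    + dim      # feed_forward.w2.bias
    + ffn_dim  # feed_forward.w3.bias
    + dim      # attention_norm.weight
    + dim      # ffn_norm.weight
)  # 100352 trainable parameters per transformer block
total = n_layers * per_layer + dim  # plus the final norm weight (assumed trainable)
assert total == 8_036_352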
+[01:45:45.834277] Param llma.tok_embeddings.weight: requires_grad False, local_size torch.Size([32000, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834312] Param llma.layers.0.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834328] Param llma.layers.0.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.834342] Param llma.layers.0.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834355] Param llma.layers.0.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.834370] Param llma.layers.0.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834382] Param llma.layers.0.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.834395] Param llma.layers.0.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834407] Param llma.layers.0.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834422] Param llma.layers.0.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834435] Param llma.layers.0.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.834449] Param llma.layers.0.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834461] Param llma.layers.0.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834474] Param llma.layers.0.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834485] Param llma.layers.0.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.834499] Param llma.layers.0.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834512] Param llma.layers.0.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834530] Param llma.layers.1.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834543] Param llma.layers.1.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.834557] Param llma.layers.1.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834569] Param llma.layers.1.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.834583] Param llma.layers.1.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834596] Param llma.layers.1.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), 
model_parallel True, dtype torch.float32 +[01:45:45.834609] Param llma.layers.1.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834621] Param llma.layers.1.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834636] Param llma.layers.1.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834648] Param llma.layers.1.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.834661] Param llma.layers.1.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834674] Param llma.layers.1.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834687] Param llma.layers.1.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834711] Param llma.layers.1.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.834725] Param llma.layers.1.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834739] Param llma.layers.1.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834755] Param llma.layers.2.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834767] Param llma.layers.2.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.834780] Param llma.layers.2.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834792] Param llma.layers.2.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.834805] Param llma.layers.2.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834817] Param llma.layers.2.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.834830] Param llma.layers.2.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834842] Param llma.layers.2.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834857] Param llma.layers.2.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834869] Param llma.layers.2.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.834882] Param llma.layers.2.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834894] Param llma.layers.2.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834907] Param llma.layers.2.feed_forward.w3.weight: 
requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834919] Param llma.layers.2.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.834932] Param llma.layers.2.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834945] Param llma.layers.2.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.834960] Param llma.layers.3.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834973] Param llma.layers.3.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.834986] Param llma.layers.3.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.834998] Param llma.layers.3.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835011] Param llma.layers.3.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835023] Param llma.layers.3.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835037] Param llma.layers.3.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835048] Param llma.layers.3.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835073] Param llma.layers.3.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835085] Param llma.layers.3.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835099] Param llma.layers.3.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835111] Param llma.layers.3.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835124] Param llma.layers.3.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835136] Param llma.layers.3.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835149] Param llma.layers.3.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835162] Param llma.layers.3.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835178] Param llma.layers.4.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835191] Param llma.layers.4.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.835205] Param llma.layers.4.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835216] Param 
llma.layers.4.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835230] Param llma.layers.4.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835241] Param llma.layers.4.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835255] Param llma.layers.4.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835267] Param llma.layers.4.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835281] Param llma.layers.4.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835292] Param llma.layers.4.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835306] Param llma.layers.4.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835319] Param llma.layers.4.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835332] Param llma.layers.4.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835344] Param llma.layers.4.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835357] Param llma.layers.4.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835371] Param llma.layers.4.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835386] Param llma.layers.5.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835398] Param llma.layers.5.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.835412] Param llma.layers.5.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835423] Param llma.layers.5.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835436] Param llma.layers.5.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835448] Param llma.layers.5.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835461] Param llma.layers.5.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835473] Param llma.layers.5.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835488] Param llma.layers.5.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835500] Param llma.layers.5.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype 
torch.float32 +[01:45:45.835513] Param llma.layers.5.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835525] Param llma.layers.5.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835539] Param llma.layers.5.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835551] Param llma.layers.5.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835564] Param llma.layers.5.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835579] Param llma.layers.5.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835594] Param llma.layers.6.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835607] Param llma.layers.6.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.835620] Param llma.layers.6.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835632] Param llma.layers.6.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835645] Param llma.layers.6.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835657] Param llma.layers.6.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835670] Param llma.layers.6.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835682] Param llma.layers.6.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835696] Param llma.layers.6.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835708] Param llma.layers.6.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835721] Param llma.layers.6.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835733] Param llma.layers.6.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835746] Param llma.layers.6.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835758] Param llma.layers.6.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835771] Param llma.layers.6.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835784] Param llma.layers.6.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835799] Param llma.layers.7.attention.wq.weight: requires_grad False, local_size 
torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835811] Param llma.layers.7.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.835824] Param llma.layers.7.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835836] Param llma.layers.7.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835849] Param llma.layers.7.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835861] Param llma.layers.7.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.835874] Param llma.layers.7.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835885] Param llma.layers.7.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835900] Param llma.layers.7.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835912] Param llma.layers.7.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835925] Param llma.layers.7.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835937] Param llma.layers.7.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835950] Param llma.layers.7.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.835962] Param llma.layers.7.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.835975] Param llma.layers.7.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.835988] Param llma.layers.7.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836004] Param llma.layers.8.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836016] Param llma.layers.8.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.836029] Param llma.layers.8.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836041] Param llma.layers.8.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836062] Param llma.layers.8.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836074] Param llma.layers.8.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836088] Param llma.layers.8.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836100] Param llma.layers.8.attention.wo.bias: 
requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836114] Param llma.layers.8.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836126] Param llma.layers.8.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.836139] Param llma.layers.8.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836151] Param llma.layers.8.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836164] Param llma.layers.8.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836176] Param llma.layers.8.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.836189] Param llma.layers.8.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836202] Param llma.layers.8.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836219] Param llma.layers.9.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836231] Param llma.layers.9.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.836244] Param llma.layers.9.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836256] Param llma.layers.9.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836269] Param llma.layers.9.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836281] Param llma.layers.9.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836294] Param llma.layers.9.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836305] Param llma.layers.9.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836320] Param llma.layers.9.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836332] Param llma.layers.9.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.836346] Param llma.layers.9.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836358] Param llma.layers.9.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836371] Param llma.layers.9.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836383] Param llma.layers.9.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 
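The pattern repeated throughout this dump is uniform: every weight matrix is frozen and kept in bfloat16, while every bias vector and RMSNorm weight is trainable and kept in float32. A minimal PyTorch sketch of such a norm/bias tuning rule, assuming the parameter-name conventions shown here (this helper is illustrative, not LLaMA2-Accessory's actual code):

import torch
import torch.nn as nn

def apply_norm_bias_tuning(model: nn.Module) -> int:
    """Hypothetical helper mirroring the requires_grad/dtype pattern above:
    freeze weight matrices in bf16, train biases and norm weights in fp32."""
    n_trainable = 0
    for name, param in model.named_parameters():
        if name.endswith(".bias") or "norm" in name:
            param.data = param.data.float()             # trainable params in fp32
            param.requires_grad = True
            n_trainable += param.numel()
        else:
            param.data = param.data.to(torch.bfloat16)  # frozen weights in bf16
            param.requires_grad = False
    return n_trainable

# Usage (hypothetical): n = apply_norm_bias_tuning(model)  # expect 8036352 here

Saving only these trainable leaves (only_save_trainable=True in the args above) is also consistent with the small checkpoints in this diff: the ~16.3 MB consolidated.00-of-01.model.pth files match roughly 8M parameters at about 2 bytes each plus serialization overhead.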
+[01:45:45.836396] Param llma.layers.9.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836409] Param llma.layers.9.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836424] Param llma.layers.10.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836436] Param llma.layers.10.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.836449] Param llma.layers.10.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836461] Param llma.layers.10.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836474] Param llma.layers.10.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836485] Param llma.layers.10.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836498] Param llma.layers.10.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836510] Param llma.layers.10.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836524] Param llma.layers.10.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836536] Param llma.layers.10.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.836549] Param llma.layers.10.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836561] Param llma.layers.10.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836575] Param llma.layers.10.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836586] Param llma.layers.10.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.836599] Param llma.layers.10.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836612] Param llma.layers.10.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836628] Param llma.layers.11.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836640] Param llma.layers.11.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.836653] Param llma.layers.11.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836665] Param llma.layers.11.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836678] Param llma.layers.11.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 
8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836690] Param llma.layers.11.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836703] Param llma.layers.11.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836715] Param llma.layers.11.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836729] Param llma.layers.11.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836740] Param llma.layers.11.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.836754] Param llma.layers.11.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836766] Param llma.layers.11.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836779] Param llma.layers.11.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836790] Param llma.layers.11.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.836804] Param llma.layers.11.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836817] Param llma.layers.11.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836832] Param llma.layers.12.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836844] Param llma.layers.12.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.836857] Param llma.layers.12.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836869] Param llma.layers.12.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836882] Param llma.layers.12.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836894] Param llma.layers.12.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.836907] Param llma.layers.12.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836919] Param llma.layers.12.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836933] Param llma.layers.12.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836945] Param llma.layers.12.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.836958] Param llma.layers.12.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836970] Param 
llma.layers.12.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.836984] Param llma.layers.12.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.836995] Param llma.layers.12.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837008] Param llma.layers.12.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837021] Param llma.layers.12.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837037] Param llma.layers.13.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837049] Param llma.layers.13.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.837070] Param llma.layers.13.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837082] Param llma.layers.13.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.837096] Param llma.layers.13.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837108] Param llma.layers.13.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.837121] Param llma.layers.13.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837133] Param llma.layers.13.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837147] Param llma.layers.13.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837159] Param llma.layers.13.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837173] Param llma.layers.13.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837184] Param llma.layers.13.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837197] Param llma.layers.13.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837209] Param llma.layers.13.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837222] Param llma.layers.13.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837235] Param llma.layers.13.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837251] Param llma.layers.14.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837263] Param llma.layers.14.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), 
model_parallel True, dtype torch.float32 +[01:45:45.837276] Param llma.layers.14.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837288] Param llma.layers.14.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.837302] Param llma.layers.14.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837314] Param llma.layers.14.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.837327] Param llma.layers.14.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837339] Param llma.layers.14.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837353] Param llma.layers.14.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837365] Param llma.layers.14.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837378] Param llma.layers.14.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837390] Param llma.layers.14.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837403] Param llma.layers.14.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837415] Param llma.layers.14.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837428] Param llma.layers.14.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837441] Param llma.layers.14.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837456] Param llma.layers.15.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837469] Param llma.layers.15.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.837482] Param llma.layers.15.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837494] Param llma.layers.15.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.837507] Param llma.layers.15.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837519] Param llma.layers.15.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.837532] Param llma.layers.15.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837544] Param llma.layers.15.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837558] Param 
llma.layers.15.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837570] Param llma.layers.15.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837584] Param llma.layers.15.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837596] Param llma.layers.15.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837609] Param llma.layers.15.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837621] Param llma.layers.15.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837634] Param llma.layers.15.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837647] Param llma.layers.15.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837663] Param llma.layers.16.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837675] Param llma.layers.16.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.837688] Param llma.layers.16.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837700] Param llma.layers.16.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.837713] Param llma.layers.16.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837725] Param llma.layers.16.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.837738] Param llma.layers.16.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837750] Param llma.layers.16.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837764] Param llma.layers.16.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837776] Param llma.layers.16.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837789] Param llma.layers.16.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837801] Param llma.layers.16.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837814] Param llma.layers.16.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.837826] Param llma.layers.16.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.837839] Param llma.layers.16.attention_norm.weight: requires_grad True, local_size 
torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.837852] Param llma.layers.16.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32
+[01:45:45.837867] Param llma.layers.17.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.837879] Param llma.layers.17.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32
+[01:45:45.837892] Param llma.layers.17.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.837904] Param llma.layers.17.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32
+[01:45:45.837918] Param llma.layers.17.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.837929] Param llma.layers.17.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32
+[01:45:45.837943] Param llma.layers.17.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.837955] Param llma.layers.17.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32
+[01:45:45.837969] Param llma.layers.17.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.837981] Param llma.layers.17.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32
+[01:45:45.837994] Param llma.layers.17.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16
+[01:45:45.838007] Param llma.layers.17.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32
+[01:45:45.838020] Param llma.layers.17.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.838032] Param llma.layers.17.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32
+[01:45:45.838045] Param llma.layers.17.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32
+[01:45:45.838066] Param llma.layers.17.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32
[... identical 16-record parameter reports for llma.layers.18 through llma.layers.45 (timestamps 01:45:45.838082–01:45:45.843848) omitted; they differ from the layer-17 block above only in layer index and timestamp: frozen bfloat16 attention/feed-forward weights, trainable float32 biases and norm weights ...]
+[01:45:45.843863] Param llma.layers.46.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.843875] Param llma.layers.46.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32
+[01:45:45.843888] Param llma.layers.46.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.843900] Param llma.layers.46.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32
+[01:45:45.843913] Param llma.layers.46.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.843925] Param llma.layers.46.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32
+[01:45:45.843937] Param llma.layers.46.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16
+[01:45:45.843949] Param llma.layers.46.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel
False, dtype torch.float32 +[01:45:45.843963] Param llma.layers.46.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.843975] Param llma.layers.46.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.843988] Param llma.layers.46.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844000] Param llma.layers.46.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844013] Param llma.layers.46.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844024] Param llma.layers.46.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844038] Param llma.layers.46.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844051] Param llma.layers.46.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844074] Param llma.layers.47.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844086] Param llma.layers.47.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.844100] Param llma.layers.47.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844112] Param llma.layers.47.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844125] Param llma.layers.47.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844137] Param llma.layers.47.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844150] Param llma.layers.47.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844163] Param llma.layers.47.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844177] Param llma.layers.47.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844189] Param llma.layers.47.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844202] Param llma.layers.47.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844214] Param llma.layers.47.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844227] Param llma.layers.47.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844239] Param llma.layers.47.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844252] Param 
llma.layers.47.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844266] Param llma.layers.47.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844281] Param llma.layers.48.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844292] Param llma.layers.48.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.844306] Param llma.layers.48.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844317] Param llma.layers.48.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844331] Param llma.layers.48.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844342] Param llma.layers.48.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844355] Param llma.layers.48.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844367] Param llma.layers.48.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844381] Param llma.layers.48.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844393] Param llma.layers.48.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844406] Param llma.layers.48.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844418] Param llma.layers.48.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844431] Param llma.layers.48.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844443] Param llma.layers.48.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844456] Param llma.layers.48.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844469] Param llma.layers.48.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844484] Param llma.layers.49.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844496] Param llma.layers.49.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.844509] Param llma.layers.49.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844521] Param llma.layers.49.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844534] Param llma.layers.49.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel 
True, dtype torch.bfloat16 +[01:45:45.844546] Param llma.layers.49.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844559] Param llma.layers.49.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844571] Param llma.layers.49.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844585] Param llma.layers.49.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844597] Param llma.layers.49.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844610] Param llma.layers.49.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844622] Param llma.layers.49.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844635] Param llma.layers.49.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844646] Param llma.layers.49.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844659] Param llma.layers.49.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844673] Param llma.layers.49.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844688] Param llma.layers.50.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844700] Param llma.layers.50.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.844713] Param llma.layers.50.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844725] Param llma.layers.50.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844738] Param llma.layers.50.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844750] Param llma.layers.50.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844763] Param llma.layers.50.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844774] Param llma.layers.50.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844788] Param llma.layers.50.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844800] Param llma.layers.50.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844813] Param llma.layers.50.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844825] Param llma.layers.50.feed_forward.w2.bias: 
requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844838] Param llma.layers.50.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844849] Param llma.layers.50.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.844862] Param llma.layers.50.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844875] Param llma.layers.50.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844890] Param llma.layers.51.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844902] Param llma.layers.51.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.844916] Param llma.layers.51.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844927] Param llma.layers.51.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844941] Param llma.layers.51.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844952] Param llma.layers.51.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.844965] Param llma.layers.51.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.844977] Param llma.layers.51.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.844991] Param llma.layers.51.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845003] Param llma.layers.51.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845016] Param llma.layers.51.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845028] Param llma.layers.51.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845041] Param llma.layers.51.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845062] Param llma.layers.51.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845075] Param llma.layers.51.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845089] Param llma.layers.51.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845104] Param llma.layers.52.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845116] Param llma.layers.52.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 
+[01:45:45.845129] Param llma.layers.52.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845141] Param llma.layers.52.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845154] Param llma.layers.52.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845166] Param llma.layers.52.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845179] Param llma.layers.52.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845191] Param llma.layers.52.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845205] Param llma.layers.52.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845217] Param llma.layers.52.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845230] Param llma.layers.52.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845242] Param llma.layers.52.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845255] Param llma.layers.52.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845267] Param llma.layers.52.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845280] Param llma.layers.52.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845293] Param llma.layers.52.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845308] Param llma.layers.53.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845320] Param llma.layers.53.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.845334] Param llma.layers.53.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845346] Param llma.layers.53.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845359] Param llma.layers.53.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845371] Param llma.layers.53.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845384] Param llma.layers.53.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845396] Param llma.layers.53.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845410] Param llma.layers.53.feed_forward.w1.weight: requires_grad False, local_size 
torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845422] Param llma.layers.53.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845435] Param llma.layers.53.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845447] Param llma.layers.53.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845460] Param llma.layers.53.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845471] Param llma.layers.53.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845485] Param llma.layers.53.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845498] Param llma.layers.53.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845514] Param llma.layers.54.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845526] Param llma.layers.54.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.845539] Param llma.layers.54.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845551] Param llma.layers.54.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845565] Param llma.layers.54.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845576] Param llma.layers.54.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845589] Param llma.layers.54.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845601] Param llma.layers.54.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845615] Param llma.layers.54.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845626] Param llma.layers.54.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845639] Param llma.layers.54.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845651] Param llma.layers.54.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845664] Param llma.layers.54.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845676] Param llma.layers.54.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845689] Param llma.layers.54.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845702] 
Param llma.layers.54.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845717] Param llma.layers.55.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845729] Param llma.layers.55.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.845743] Param llma.layers.55.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845754] Param llma.layers.55.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845767] Param llma.layers.55.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845779] Param llma.layers.55.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845792] Param llma.layers.55.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845804] Param llma.layers.55.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845818] Param llma.layers.55.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845830] Param llma.layers.55.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845843] Param llma.layers.55.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845855] Param llma.layers.55.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845868] Param llma.layers.55.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845880] Param llma.layers.55.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.845893] Param llma.layers.55.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845907] Param llma.layers.55.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.845922] Param llma.layers.56.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845934] Param llma.layers.56.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.845947] Param llma.layers.56.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845959] Param llma.layers.56.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.845972] Param llma.layers.56.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.845984] Param llma.layers.56.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), 
model_parallel True, dtype torch.float32 +[01:45:45.845997] Param llma.layers.56.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846008] Param llma.layers.56.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846023] Param llma.layers.56.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846034] Param llma.layers.56.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846047] Param llma.layers.56.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846067] Param llma.layers.56.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846081] Param llma.layers.56.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846092] Param llma.layers.56.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846105] Param llma.layers.56.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846119] Param llma.layers.56.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846134] Param llma.layers.57.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846146] Param llma.layers.57.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.846159] Param llma.layers.57.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846171] Param llma.layers.57.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.846185] Param llma.layers.57.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846196] Param llma.layers.57.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.846210] Param llma.layers.57.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846221] Param llma.layers.57.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846235] Param llma.layers.57.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846247] Param llma.layers.57.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846260] Param llma.layers.57.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846272] Param llma.layers.57.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846285] Param 
llma.layers.57.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846297] Param llma.layers.57.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846310] Param llma.layers.57.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846323] Param llma.layers.57.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846339] Param llma.layers.58.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846351] Param llma.layers.58.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.846364] Param llma.layers.58.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846376] Param llma.layers.58.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.846389] Param llma.layers.58.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846401] Param llma.layers.58.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.846414] Param llma.layers.58.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846426] Param llma.layers.58.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846440] Param llma.layers.58.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846452] Param llma.layers.58.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846465] Param llma.layers.58.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846477] Param llma.layers.58.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846490] Param llma.layers.58.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846501] Param llma.layers.58.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846515] Param llma.layers.58.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846528] Param llma.layers.58.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846543] Param llma.layers.59.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846555] Param llma.layers.59.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.846568] Param llma.layers.59.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), 
model_parallel True, dtype torch.bfloat16 +[01:45:45.846580] Param llma.layers.59.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.846594] Param llma.layers.59.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846605] Param llma.layers.59.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.846618] Param llma.layers.59.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846630] Param llma.layers.59.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846644] Param llma.layers.59.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846656] Param llma.layers.59.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846669] Param llma.layers.59.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846681] Param llma.layers.59.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846695] Param llma.layers.59.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846714] Param llma.layers.59.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846727] Param llma.layers.59.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846741] Param llma.layers.59.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846756] Param llma.layers.60.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846768] Param llma.layers.60.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.846781] Param llma.layers.60.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846793] Param llma.layers.60.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.846806] Param llma.layers.60.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846818] Param llma.layers.60.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.846831] Param llma.layers.60.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846843] Param llma.layers.60.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846857] Param llma.layers.60.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846868] Param 
llma.layers.60.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846881] Param llma.layers.60.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846893] Param llma.layers.60.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846906] Param llma.layers.60.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846918] Param llma.layers.60.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.846931] Param llma.layers.60.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846944] Param llma.layers.60.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.846959] Param llma.layers.61.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846971] Param llma.layers.61.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.846984] Param llma.layers.61.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.846996] Param llma.layers.61.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847009] Param llma.layers.61.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847021] Param llma.layers.61.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847034] Param llma.layers.61.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847046] Param llma.layers.61.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847068] Param llma.layers.61.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847080] Param llma.layers.61.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847093] Param llma.layers.61.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847105] Param llma.layers.61.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847118] Param llma.layers.61.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847130] Param llma.layers.61.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847143] Param llma.layers.61.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847156] Param llma.layers.61.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), 
model_parallel False, dtype torch.float32 +[01:45:45.847171] Param llma.layers.62.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847183] Param llma.layers.62.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.847196] Param llma.layers.62.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847208] Param llma.layers.62.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847221] Param llma.layers.62.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847233] Param llma.layers.62.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847246] Param llma.layers.62.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847258] Param llma.layers.62.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847272] Param llma.layers.62.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847283] Param llma.layers.62.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847296] Param llma.layers.62.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847308] Param llma.layers.62.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847321] Param llma.layers.62.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847333] Param llma.layers.62.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847345] Param llma.layers.62.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847358] Param llma.layers.62.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847373] Param llma.layers.63.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847385] Param llma.layers.63.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.847399] Param llma.layers.63.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847411] Param llma.layers.63.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847425] Param llma.layers.63.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847437] Param llma.layers.63.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847451] Param llma.layers.63.attention.wo.weight: 
requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847462] Param llma.layers.63.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847477] Param llma.layers.63.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847489] Param llma.layers.63.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847502] Param llma.layers.63.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847514] Param llma.layers.63.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847527] Param llma.layers.63.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847538] Param llma.layers.63.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847551] Param llma.layers.63.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847564] Param llma.layers.63.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847579] Param llma.layers.64.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847591] Param llma.layers.64.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.847604] Param llma.layers.64.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847615] Param llma.layers.64.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847628] Param llma.layers.64.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847640] Param llma.layers.64.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847653] Param llma.layers.64.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847665] Param llma.layers.64.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847679] Param llma.layers.64.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847691] Param llma.layers.64.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847704] Param llma.layers.64.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847716] Param llma.layers.64.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847729] Param llma.layers.64.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, 
dtype torch.bfloat16 +[01:45:45.847740] Param llma.layers.64.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847753] Param llma.layers.64.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847766] Param llma.layers.64.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847781] Param llma.layers.65.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847792] Param llma.layers.65.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.847805] Param llma.layers.65.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847817] Param llma.layers.65.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847830] Param llma.layers.65.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847842] Param llma.layers.65.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.847855] Param llma.layers.65.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847866] Param llma.layers.65.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847880] Param llma.layers.65.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847892] Param llma.layers.65.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847905] Param llma.layers.65.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847917] Param llma.layers.65.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847930] Param llma.layers.65.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847941] Param llma.layers.65.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.847954] Param llma.layers.65.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847967] Param llma.layers.65.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.847982] Param llma.layers.66.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.847994] Param llma.layers.66.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.848007] Param llma.layers.66.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848018] Param llma.layers.66.attention.wk.bias: requires_grad True, 
local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848031] Param llma.layers.66.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848043] Param llma.layers.66.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848064] Param llma.layers.66.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848076] Param llma.layers.66.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848090] Param llma.layers.66.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848102] Param llma.layers.66.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848115] Param llma.layers.66.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848127] Param llma.layers.66.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848140] Param llma.layers.66.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848152] Param llma.layers.66.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848165] Param llma.layers.66.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848178] Param llma.layers.66.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848193] Param llma.layers.67.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848205] Param llma.layers.67.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.848218] Param llma.layers.67.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848230] Param llma.layers.67.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848243] Param llma.layers.67.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848255] Param llma.layers.67.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848268] Param llma.layers.67.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848279] Param llma.layers.67.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848294] Param llma.layers.67.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848305] Param llma.layers.67.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848319] 
Param llma.layers.67.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848331] Param llma.layers.67.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848344] Param llma.layers.67.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848356] Param llma.layers.67.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848369] Param llma.layers.67.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848382] Param llma.layers.67.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848396] Param llma.layers.68.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848408] Param llma.layers.68.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.848421] Param llma.layers.68.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848433] Param llma.layers.68.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848447] Param llma.layers.68.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848459] Param llma.layers.68.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848472] Param llma.layers.68.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848483] Param llma.layers.68.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848497] Param llma.layers.68.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848509] Param llma.layers.68.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848522] Param llma.layers.68.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848534] Param llma.layers.68.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848547] Param llma.layers.68.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848558] Param llma.layers.68.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848571] Param llma.layers.68.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848584] Param llma.layers.68.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848599] Param llma.layers.69.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 
8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848610] Param llma.layers.69.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.848623] Param llma.layers.69.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848635] Param llma.layers.69.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848647] Param llma.layers.69.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848659] Param llma.layers.69.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848672] Param llma.layers.69.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848684] Param llma.layers.69.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848698] Param llma.layers.69.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848709] Param llma.layers.69.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848722] Param llma.layers.69.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848734] Param llma.layers.69.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848747] Param llma.layers.69.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848759] Param llma.layers.69.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848772] Param llma.layers.69.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848784] Param llma.layers.69.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848800] Param llma.layers.70.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848811] Param llma.layers.70.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.848825] Param llma.layers.70.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848836] Param llma.layers.70.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848849] Param llma.layers.70.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848861] Param llma.layers.70.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.848874] Param llma.layers.70.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848886] Param 
llma.layers.70.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848900] Param llma.layers.70.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848911] Param llma.layers.70.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848924] Param llma.layers.70.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848936] Param llma.layers.70.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848949] Param llma.layers.70.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.848961] Param llma.layers.70.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.848973] Param llma.layers.70.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.848986] Param llma.layers.70.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849001] Param llma.layers.71.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849013] Param llma.layers.71.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.849026] Param llma.layers.71.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849038] Param llma.layers.71.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849059] Param llma.layers.71.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849072] Param llma.layers.71.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849084] Param llma.layers.71.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849096] Param llma.layers.71.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849110] Param llma.layers.71.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849122] Param llma.layers.71.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849135] Param llma.layers.71.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849146] Param llma.layers.71.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849160] Param llma.layers.71.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849171] Param llma.layers.71.feed_forward.w3.bias: requires_grad True, local_size 
torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849184] Param llma.layers.71.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849197] Param llma.layers.71.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849212] Param llma.layers.72.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849223] Param llma.layers.72.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.849237] Param llma.layers.72.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849248] Param llma.layers.72.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849261] Param llma.layers.72.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849273] Param llma.layers.72.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849286] Param llma.layers.72.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849298] Param llma.layers.72.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849312] Param llma.layers.72.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849323] Param llma.layers.72.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849336] Param llma.layers.72.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849348] Param llma.layers.72.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849361] Param llma.layers.72.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849373] Param llma.layers.72.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849386] Param llma.layers.72.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849398] Param llma.layers.72.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849414] Param llma.layers.73.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849425] Param llma.layers.73.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.849439] Param llma.layers.73.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849450] Param llma.layers.73.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849463] Param 
llma.layers.73.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849475] Param llma.layers.73.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849489] Param llma.layers.73.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849501] Param llma.layers.73.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849514] Param llma.layers.73.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849526] Param llma.layers.73.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849539] Param llma.layers.73.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849551] Param llma.layers.73.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849564] Param llma.layers.73.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849575] Param llma.layers.73.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849588] Param llma.layers.73.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849601] Param llma.layers.73.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849615] Param llma.layers.74.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849627] Param llma.layers.74.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.849640] Param llma.layers.74.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849651] Param llma.layers.74.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849664] Param llma.layers.74.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849676] Param llma.layers.74.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849689] Param llma.layers.74.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849701] Param llma.layers.74.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849715] Param llma.layers.74.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849726] Param llma.layers.74.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849739] Param llma.layers.74.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 
28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849751] Param llma.layers.74.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849764] Param llma.layers.74.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849776] Param llma.layers.74.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849789] Param llma.layers.74.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849801] Param llma.layers.74.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849816] Param llma.layers.75.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849828] Param llma.layers.75.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.849841] Param llma.layers.75.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849853] Param llma.layers.75.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849866] Param llma.layers.75.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849877] Param llma.layers.75.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.849890] Param llma.layers.75.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849902] Param llma.layers.75.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849915] Param llma.layers.75.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849927] Param llma.layers.75.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849940] Param llma.layers.75.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849952] Param llma.layers.75.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.849965] Param llma.layers.75.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.849977] Param llma.layers.75.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.849990] Param llma.layers.75.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850003] Param llma.layers.75.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850018] Param llma.layers.76.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850030] Param 
llma.layers.76.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.850043] Param llma.layers.76.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850062] Param llma.layers.76.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.850076] Param llma.layers.76.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850087] Param llma.layers.76.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.850100] Param llma.layers.76.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850112] Param llma.layers.76.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850126] Param llma.layers.76.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850138] Param llma.layers.76.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.850151] Param llma.layers.76.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850179] Param llma.layers.76.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850192] Param llma.layers.76.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850204] Param llma.layers.76.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.850217] Param llma.layers.76.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850230] Param llma.layers.76.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850245] Param llma.layers.77.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850257] Param llma.layers.77.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.850270] Param llma.layers.77.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850282] Param llma.layers.77.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.850295] Param llma.layers.77.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850306] Param llma.layers.77.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.850319] Param llma.layers.77.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850331] Param llma.layers.77.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel 
False, dtype torch.float32 +[01:45:45.850345] Param llma.layers.77.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850357] Param llma.layers.77.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.850370] Param llma.layers.77.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850381] Param llma.layers.77.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850395] Param llma.layers.77.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850406] Param llma.layers.77.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.850419] Param llma.layers.77.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850432] Param llma.layers.77.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850447] Param llma.layers.78.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850458] Param llma.layers.78.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.850471] Param llma.layers.78.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850483] Param llma.layers.78.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.850496] Param llma.layers.78.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850508] Param llma.layers.78.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.850521] Param llma.layers.78.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850532] Param llma.layers.78.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850546] Param llma.layers.78.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850558] Param llma.layers.78.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.850571] Param llma.layers.78.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850583] Param llma.layers.78.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850596] Param llma.layers.78.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850607] Param llma.layers.78.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.850620] Param 
llma.layers.78.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850633] Param llma.layers.78.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850648] Param llma.layers.79.attention.wq.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850659] Param llma.layers.79.attention.wq.bias: requires_grad True, local_size torch.Size([8192]), model_parallel True, dtype torch.float32 +[01:45:45.850673] Param llma.layers.79.attention.wk.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850684] Param llma.layers.79.attention.wk.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.850704] Param llma.layers.79.attention.wv.weight: requires_grad False, local_size torch.Size([1024, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850716] Param llma.layers.79.attention.wv.bias: requires_grad True, local_size torch.Size([1024]), model_parallel True, dtype torch.float32 +[01:45:45.850729] Param llma.layers.79.attention.wo.weight: requires_grad False, local_size torch.Size([8192, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850741] Param llma.layers.79.attention.wo.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850755] Param llma.layers.79.feed_forward.w1.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850767] Param llma.layers.79.feed_forward.w1.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.850780] Param llma.layers.79.feed_forward.w2.weight: requires_grad False, local_size torch.Size([8192, 28672]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850792] Param llma.layers.79.feed_forward.w2.bias: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850805] Param llma.layers.79.feed_forward.w3.weight: requires_grad False, local_size torch.Size([28672, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850817] Param llma.layers.79.feed_forward.w3.bias: requires_grad True, local_size torch.Size([28672]), model_parallel True, dtype torch.float32 +[01:45:45.850830] Param llma.layers.79.attention_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850842] Param llma.layers.79.ffn_norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850856] Param llma.norm.weight: requires_grad True, local_size torch.Size([8192]), model_parallel False, dtype torch.float32 +[01:45:45.850870] Param llma.output.weight: requires_grad False, local_size torch.Size([32000, 8192]), model_parallel True, dtype torch.bfloat16 +[01:45:45.850908] ## Load pretrained from ../checkpoints/llama2/Llama-2-70b/ +[01:48:36.775804] ## Quantizing model to 4bit! 
+ Quantization Process: 0%| | 0/561 [00:00<?, ?it/s] +[04:16:11.747956] Start training for 3 epochs +[04:16:11.757783] log_dir: ./output_dir +[04:16:20.562890] Epoch: [0] [0/3112] lr: 0.000000 closs: 1.2495 (1.2495) time: 8.8045 data: 1.6135 max mem: 55252 +[04:17:17.833703] Epoch: [0] [10/3112] lr: 0.000001 closs: 1.0279 (2.6928) grad_norm: 5.5482 (9.2752) time: 6.0067 data: 0.1469 max mem: 71323 +[04:18:15.203100] Epoch: [0] [20/3112] lr: 0.000003 closs: 0.9449 (2.3329) grad_norm: 9.9532 (22.1729) time: 5.7319 data: 0.0002 max mem: 71323 +[04:19:12.596899] Epoch: [0] [30/3112] lr: 0.000004 closs: 1.0569 (2.8670) grad_norm: 9.9532 (18.8243) time: 5.7381 data: 0.0002 max mem: 71323 +[04:20:09.945026] Epoch: [0] [40/3112] lr: 0.000006 closs: 1.0608 (2.6430) grad_norm: 9.9532 (16.3999) time: 5.7370 data: 0.0002 max mem: 71323 +[04:21:07.176403] Epoch: [0] [50/3112] lr: 0.000008 closs: 0.8017 (2.2753) grad_norm: 9.2899 (14.9817) time: 5.7289 data: 0.0002 max mem: 71323 +[04:22:04.575993] Epoch: [0] [60/3112] lr: 0.000010 closs: 0.8011 (2.3635) grad_norm: 9.2899 (13.6211) time: 5.7315 data: 0.0002 max mem: 71323 +[04:23:01.904531] Epoch: [0] [70/3112] lr: 0.000011 closs: 0.9501 (2.3969) grad_norm: 9.2899 (14.6349) time: 5.7363 data: 0.0002 max mem: 71323 +[04:23:59.066019] Epoch: [0] [80/3112] lr: 0.000013 closs: 0.9048 (2.2119) grad_norm: 8.7074 (13.5844) time: 5.7244 data: 0.0002 max mem: 71323 +[04:24:56.371592] Epoch: [0] [90/3112] lr: 0.000014 closs: 0.9325 (2.2345) grad_norm: 8.5086 (13.0566) time: 5.7233 data: 0.0002 max mem: 71323 +[04:25:53.675967] Epoch: [0] [100/3112] lr: 0.000016 closs: 0.9065 (2.1644) grad_norm: 7.7499 (12.2959) time: 5.7304 data: 0.0002 max mem: 71323 +[04:26:51.023277] Epoch: [0] [110/3112] lr: 0.000017 closs: 0.6248 (2.1151) grad_norm: 7.0736 (11.7787) time: 5.7325 data: 0.0002 max mem: 71323 +[04:27:48.457237] Epoch: [0] [120/3112] lr: 0.000019 closs: 0.6418 (2.0147) grad_norm: 7.0479 (11.1748) time: 5.7389 data: 0.0002 max mem: 71323 +[04:28:45.809674] Epoch: [0] [130/3112] lr: 0.000020 closs: 0.6392 (1.9427) grad_norm: 6.6030 (10.8302) time: 5.7392 data: 0.0002 max mem: 71323 +[04:29:43.259669] Epoch: [0] [140/3112] lr: 0.000020 closs: 0.6311 (1.9063) grad_norm: 6.3709 (10.4552) time: 5.7400 data: 0.0002 max mem: 71323 +[04:30:40.588082] Epoch: [0] [150/3112] lr: 0.000020 closs: 0.9006 (1.8538) grad_norm: 6.3709 (10.4020) time: 5.7388 data: 0.0002 max mem: 71323 +[04:31:37.952846] Epoch: [0] [160/3112] lr: 0.000020 closs: 0.9104 (1.8201) grad_norm: 6.3473 (10.0428) time: 5.7346 data: 0.0002 max mem: 71323 +[04:32:35.125560] Epoch: [0] [170/3112] lr: 0.000020 closs: 0.7512 (1.7575) grad_norm: 5.8198 (9.7499) time: 5.7268 data: 0.0002 max mem: 71323 +[04:33:32.383153] Epoch: [0] [180/3112] lr: 0.000020 closs: 0.8407 (1.7105) grad_norm: 4.9749 (9.2773) time: 5.7214 data: 0.0002 max mem: 71323 +[04:34:29.763966] Epoch: [0] [190/3112] lr: 0.000020 closs: 0.8542 (1.6575) grad_norm: 4.8554 (8.9987) time: 5.7318 data: 0.0002 max mem: 71323 +[04:35:27.229328] Epoch: [0] [200/3112] lr: 0.000020 closs: 0.6368 (1.6114) grad_norm: 4.5984 (8.6394) time: 5.7421 data: 0.0002 max mem: 71323 +[04:36:24.484571] Epoch: [0] [210/3112] lr: 0.000020 closs: 0.6368 (1.5767) grad_norm: 3.9001 (8.4014) time: 5.7359 data: 0.0002 max mem: 71323 +[04:37:21.748690] Epoch: [0] [220/3112] lr: 0.000020 closs: 0.7064 (1.5378) grad_norm: 3.1832 (8.0965) time: 5.7259 data: 0.0002 max mem: 71323 +[04:38:19.100962] Epoch: [0] [230/3112] lr: 0.000020 closs: 0.6201 (1.4931) grad_norm: 2.5783 (7.8905) time:
5.7307 data: 0.0002 max mem: 71323 +[04:39:16.339887] Epoch: [0] [240/3112] lr: 0.000020 closs: 0.6323 (1.4808) grad_norm: 2.4319 (7.6144) time: 5.7294 data: 0.0002 max mem: 71323 +[04:40:13.913080] Epoch: [0] [250/3112] lr: 0.000020 closs: 0.6323 (1.4464) grad_norm: 2.3918 (7.4538) time: 5.7404 data: 0.0002 max mem: 71323 +[04:41:11.386935] Epoch: [0] [260/3112] lr: 0.000020 closs: 0.4910 (1.4081) grad_norm: 2.3695 (7.2130) time: 5.7522 data: 0.0002 max mem: 71323 +[04:42:08.793583] Epoch: [0] [270/3112] lr: 0.000020 closs: 0.4910 (1.3840) grad_norm: 2.3695 (7.0772) time: 5.7439 data: 0.0002 max mem: 71323 +[04:43:06.195578] Epoch: [0] [280/3112] lr: 0.000020 closs: 0.6243 (1.3570) grad_norm: 2.3695 (6.8798) time: 5.7403 data: 0.0002 max mem: 71323 +[04:44:03.549386] Epoch: [0] [290/3112] lr: 0.000020 closs: 0.5797 (1.3303) grad_norm: 2.3517 (6.7607) time: 5.7377 data: 0.0002 max mem: 71323 +[04:45:00.771463] Epoch: [0] [300/3112] lr: 0.000020 closs: 0.5879 (1.3145) grad_norm: 2.3221 (6.5751) time: 5.7287 data: 0.0002 max mem: 71323 +[04:45:58.130460] Epoch: [0] [310/3112] lr: 0.000020 closs: 0.5763 (1.2890) grad_norm: 2.3193 (6.4571) time: 5.7289 data: 0.0002 max mem: 71323 +[04:46:55.484228] Epoch: [0] [320/3112] lr: 0.000020 closs: 0.4807 (1.2695) grad_norm: 2.3193 (6.3155) time: 5.7355 data: 0.0002 max mem: 71323 +[04:47:52.727815] Epoch: [0] [330/3112] lr: 0.000020 closs: 0.5266 (1.2584) grad_norm: 2.3193 (6.2144) time: 5.7298 data: 0.0002 max mem: 71323 +[04:48:50.008242] Epoch: [0] [340/3112] lr: 0.000020 closs: 0.5635 (1.2437) grad_norm: 2.7939 (6.1109) time: 5.7261 data: 0.0002 max mem: 71323 +[04:49:47.477904] Epoch: [0] [350/3112] lr: 0.000020 closs: 0.5614 (1.2256) grad_norm: 2.4241 (6.0234) time: 5.7374 data: 0.0002 max mem: 71323 +[04:50:44.813576] Epoch: [0] [360/3112] lr: 0.000020 closs: 0.5105 (1.2087) grad_norm: 2.4241 (5.9345) time: 5.7401 data: 0.0001 max mem: 71323 +[04:51:41.939318] Epoch: [0] [370/3112] lr: 0.000020 closs: 0.6744 (1.1950) grad_norm: 2.4241 (5.8797) time: 5.7229 data: 0.0002 max mem: 71323 +[04:52:39.305595] Epoch: [0] [380/3112] lr: 0.000020 closs: 0.4579 (1.1773) grad_norm: 2.8243 (5.7968) time: 5.7245 data: 0.0002 max mem: 71323 +[04:53:36.510153] Epoch: [0] [390/3112] lr: 0.000020 closs: 0.4579 (1.1657) grad_norm: 2.9991 (5.7550) time: 5.7284 data: 0.0002 max mem: 71323 +[04:54:33.832283] Epoch: [0] [400/3112] lr: 0.000020 closs: 0.5183 (1.1557) grad_norm: 2.9265 (5.6618) time: 5.7262 data: 0.0002 max mem: 71323 +[04:55:31.200308] Epoch: [0] [410/3112] lr: 0.000020 closs: 0.5562 (1.1437) grad_norm: 2.8121 (5.5942) time: 5.7343 data: 0.0002 max mem: 71323 +[04:56:28.507534] Epoch: [0] [420/3112] lr: 0.000020 closs: 0.5593 (1.1332) grad_norm: 2.6379 (5.5115) time: 5.7336 data: 0.0002 max mem: 71323 +[04:57:25.681611] Epoch: [0] [430/3112] lr: 0.000020 closs: 0.7076 (1.1280) grad_norm: 2.6379 (5.4440) time: 5.7239 data: 0.0002 max mem: 71323 +[04:58:23.215662] Epoch: [0] [440/3112] lr: 0.000020 closs: 0.7227 (1.1221) grad_norm: 2.5528 (5.3553) time: 5.7353 data: 0.0002 max mem: 71323 +[04:59:20.456400] Epoch: [0] [450/3112] lr: 0.000020 closs: 0.7610 (1.1173) grad_norm: 2.5515 (5.2968) time: 5.7386 data: 0.0002 max mem: 71323 +[05:00:17.596044] Epoch: [0] [460/3112] lr: 0.000020 closs: 0.5390 (1.1025) grad_norm: 2.5495 (5.2281) time: 5.7189 data: 0.0002 max mem: 71323 +[05:01:14.938479] Epoch: [0] [470/3112] lr: 0.000020 closs: 0.5235 (1.0927) grad_norm: 2.4580 (5.1840) time: 5.7239 data: 0.0002 max mem: 71323 +[05:02:12.201913] Epoch: [0] 
[480/3112] lr: 0.000020 closs: 0.6515 (1.0827) grad_norm: 2.3921 (5.1050) time: 5.7301 data: 0.0002 max mem: 71323 +[05:03:09.617529] Epoch: [0] [490/3112] lr: 0.000020 closs: 0.6926 (1.0736) grad_norm: 2.4580 (5.0583) time: 5.7338 data: 0.0002 max mem: 71323 +[05:04:06.852672] Epoch: [0] [500/3112] lr: 0.000020 closs: 0.5815 (1.0672) grad_norm: 2.2241 (4.9864) time: 5.7324 data: 0.0002 max mem: 71323 +[05:05:04.360450] Epoch: [0] [510/3112] lr: 0.000020 closs: 0.6993 (1.0597) grad_norm: 2.3682 (4.9514) time: 5.7370 data: 0.0002 max mem: 71323 +[05:06:01.775857] Epoch: [0] [520/3112] lr: 0.000020 closs: 0.6312 (1.0530) grad_norm: 2.4580 (4.9165) time: 5.7460 data: 0.0002 max mem: 71323 +[05:06:59.033364] Epoch: [0] [530/3112] lr: 0.000020 closs: 0.5645 (1.0478) grad_norm: 2.5495 (4.8847) time: 5.7335 data: 0.0002 max mem: 71323 +[05:07:56.247058] Epoch: [0] [540/3112] lr: 0.000020 closs: 0.7086 (1.0459) grad_norm: 2.4209 (4.8313) time: 5.7234 data: 0.0002 max mem: 71323 +[05:08:53.518581] Epoch: [0] [550/3112] lr: 0.000020 closs: 0.6712 (1.0407) grad_norm: 2.4209 (4.8001) time: 5.7241 data: 0.0002 max mem: 71323 +[05:09:50.801208] Epoch: [0] [560/3112] lr: 0.000020 closs: 0.7142 (1.0343) grad_norm: 2.4209 (4.7580) time: 5.7276 data: 0.0002 max mem: 71323 +[05:10:48.261756] Epoch: [0] [570/3112] lr: 0.000020 closs: 0.7438 (1.0352) grad_norm: 2.4209 (4.7430) time: 5.7370 data: 0.0002 max mem: 71323 +[05:11:45.483854] Epoch: [0] [580/3112] lr: 0.000020 closs: 0.4998 (1.0235) grad_norm: 2.4209 (4.7112) time: 5.7339 data: 0.0002 max mem: 71323 +[05:12:42.839052] Epoch: [0] [590/3112] lr: 0.000020 closs: 0.4998 (1.0169) grad_norm: 2.4209 (4.6832) time: 5.7287 data: 0.0002 max mem: 71323 +[05:13:40.220454] Epoch: [0] [600/3112] lr: 0.000020 closs: 0.5005 (1.0070) grad_norm: 2.5371 (4.6534) time: 5.7367 data: 0.0002 max mem: 71323 +[05:14:37.514407] Epoch: [0] [610/3112] lr: 0.000020 closs: 0.4686 (1.0011) grad_norm: 2.5371 (4.6324) time: 5.7336 data: 0.0002 max mem: 71323 +[05:15:34.833577] Epoch: [0] [620/3112] lr: 0.000020 closs: 0.7522 (0.9984) grad_norm: 2.3765 (4.5812) time: 5.7306 data: 0.0002 max mem: 71323 +[05:16:31.947792] Epoch: [0] [630/3112] lr: 0.000020 closs: 0.7057 (0.9936) grad_norm: 2.5371 (4.5582) time: 5.7216 data: 0.0002 max mem: 71323 +[05:17:29.271430] Epoch: [0] [640/3112] lr: 0.000020 closs: 0.5913 (0.9843) grad_norm: 2.6371 (4.5592) time: 5.7218 data: 0.0002 max mem: 71323 +[05:18:26.702668] Epoch: [0] [650/3112] lr: 0.000020 closs: 0.3504 (0.9765) grad_norm: 2.6326 (4.5215) time: 5.7376 data: 0.0002 max mem: 71323 +[05:19:23.954745] Epoch: [0] [660/3112] lr: 0.000020 closs: 0.5721 (0.9735) grad_norm: 2.5897 (4.4813) time: 5.7340 data: 0.0002 max mem: 71323 +[05:20:21.213191] Epoch: [0] [670/3112] lr: 0.000020 closs: 0.6470 (0.9683) grad_norm: 2.5372 (4.4492) time: 5.7254 data: 0.0002 max mem: 71323 +[05:21:18.704973] Epoch: [0] [680/3112] lr: 0.000020 closs: 0.6106 (0.9625) grad_norm: 2.5372 (4.4225) time: 5.7374 data: 0.0002 max mem: 71323 +[05:22:15.962228] Epoch: [0] [690/3112] lr: 0.000020 closs: 0.6106 (0.9576) grad_norm: 2.5372 (4.3971) time: 5.7373 data: 0.0002 max mem: 71323 +[05:23:13.315112] Epoch: [0] [700/3112] lr: 0.000020 closs: 0.6877 (0.9540) grad_norm: 2.5897 (4.3731) time: 5.7303 data: 0.0002 max mem: 71323 +[05:24:10.732903] Epoch: [0] [710/3112] lr: 0.000020 closs: 0.6877 (0.9565) grad_norm: 2.5897 (4.3769) time: 5.7384 data: 0.0002 max mem: 71323 +[05:25:07.931208] Epoch: [0] [720/3112] lr: 0.000020 closs: 0.6663 (0.9511) grad_norm: 2.5372 (4.3616) 
time: 5.7306 data: 0.0002 max mem: 71323 +[05:26:05.349971] Epoch: [0] [730/3112] lr: 0.000020 closs: 0.5524 (0.9460) grad_norm: 2.6416 (4.3478) time: 5.7307 data: 0.0002 max mem: 71323 +[05:27:02.724423] Epoch: [0] [740/3112] lr: 0.000020 closs: 0.6319 (0.9427) grad_norm: 2.6586 (4.3139) time: 5.7395 data: 0.0002 max mem: 71323 +[05:28:00.235723] Epoch: [0] [750/3112] lr: 0.000020 closs: 0.6103 (0.9400) grad_norm: 2.9443 (4.3329) time: 5.7441 data: 0.0002 max mem: 71323 +[05:28:57.527829] Epoch: [0] [760/3112] lr: 0.000020 closs: 0.4909 (0.9337) grad_norm: 2.9983 (4.4548) time: 5.7400 data: 0.0002 max mem: 71323 +[05:29:54.846989] Epoch: [0] [770/3112] lr: 0.000020 closs: 0.3275 (0.9280) grad_norm: 2.9443 (4.4280) time: 5.7304 data: 0.0002 max mem: 71323 +[05:30:52.045770] Epoch: [0] [780/3112] lr: 0.000020 closs: 0.5112 (0.9226) grad_norm: 2.9983 (4.4069) time: 5.7257 data: 0.0002 max mem: 71323 +[05:31:49.451637] Epoch: [0] [790/3112] lr: 0.000020 closs: 0.4526 (0.9180) grad_norm: 2.7942 (4.3883) time: 5.7301 data: 0.0002 max mem: 71323 +[05:32:46.798182] Epoch: [0] [800/3112] lr: 0.000020 closs: 0.4526 (0.9125) grad_norm: 2.7942 (4.3611) time: 5.7375 data: 0.0002 max mem: 71323 +[05:33:44.157779] Epoch: [0] [810/3112] lr: 0.000020 closs: 0.5629 (0.9100) grad_norm: 2.3475 (4.3385) time: 5.7352 data: 0.0002 max mem: 71323 +[05:34:41.357523] Epoch: [0] [820/3112] lr: 0.000020 closs: 0.4988 (0.9044) grad_norm: 2.3409 (4.2992) time: 5.7278 data: 0.0002 max mem: 71323 +[05:35:38.834862] Epoch: [0] [830/3112] lr: 0.000020 closs: 0.4988 (0.9037) grad_norm: 2.3119 (4.2856) time: 5.7337 data: 0.0002 max mem: 71323 +[05:36:36.198511] Epoch: [0] [840/3112] lr: 0.000020 closs: 0.6597 (0.9018) grad_norm: 2.2993 (4.2599) time: 5.7419 data: 0.0002 max mem: 71323 +[05:37:33.617151] Epoch: [0] [850/3112] lr: 0.000020 closs: 0.5867 (0.8992) grad_norm: 2.3119 (4.2420) time: 5.7390 data: 0.0002 max mem: 71323 +[05:38:30.751502] Epoch: [0] [860/3112] lr: 0.000020 closs: 0.5979 (0.8961) grad_norm: 2.2564 (4.2105) time: 5.7275 data: 0.0002 max mem: 71323 +[05:39:28.097655] Epoch: [0] [870/3112] lr: 0.000020 closs: 0.5979 (0.8936) grad_norm: 2.2564 (4.1993) time: 5.7239 data: 0.0002 max mem: 71323 +[05:40:25.340221] Epoch: [0] [880/3112] lr: 0.000020 closs: 0.6262 (0.8919) grad_norm: 2.1552 (4.1807) time: 5.7293 data: 0.0002 max mem: 71323 +[05:41:22.549652] Epoch: [0] [890/3112] lr: 0.000020 closs: 0.6839 (0.8900) grad_norm: 2.2993 (4.1672) time: 5.7225 data: 0.0002 max mem: 71323 +[05:42:19.913767] Epoch: [0] [900/3112] lr: 0.000020 closs: 0.4624 (0.8863) grad_norm: 2.5604 (4.1599) time: 5.7286 data: 0.0002 max mem: 71323 +[05:43:17.290374] Epoch: [0] [910/3112] lr: 0.000020 closs: 0.4624 (0.8844) grad_norm: 2.6194 (4.1849) time: 5.7368 data: 0.0002 max mem: 71323 +[05:44:14.524017] Epoch: [0] [920/3112] lr: 0.000020 closs: 0.5317 (0.8820) grad_norm: 2.7321 (4.1924) time: 5.7303 data: 0.0002 max mem: 71323 +[05:45:11.968923] Epoch: [0] [930/3112] lr: 0.000020 closs: 0.5317 (0.8802) grad_norm: 2.7321 (4.1727) time: 5.7338 data: 0.0002 max mem: 71323 +[05:46:09.253479] Epoch: [0] [940/3112] lr: 0.000020 closs: 0.4783 (0.8768) grad_norm: 2.7321 (4.1470) time: 5.7364 data: 0.0002 max mem: 71323 +[05:47:06.560415] Epoch: [0] [950/3112] lr: 0.000020 closs: 0.4554 (0.8722) grad_norm: 2.7321 (4.1561) time: 5.7294 data: 0.0002 max mem: 71323 +[05:48:03.711139] Epoch: [0] [960/3112] lr: 0.000020 closs: 0.4945 (0.8696) grad_norm: 2.8167 (4.1487) time: 5.7227 data: 0.0002 max mem: 71323 +[05:49:00.800861] Epoch: [0] 
[970/3112] lr: 0.000020 closs: 0.4945 (0.8659) grad_norm: 2.7321 (4.1342) time: 5.7118 data: 0.0002 max mem: 71323 +[05:49:58.006531] Epoch: [0] [980/3112] lr: 0.000020 closs: 0.4473 (0.8631) grad_norm: 2.7238 (4.1182) time: 5.7146 data: 0.0002 max mem: 71323 +[05:50:55.439710] Epoch: [0] [990/3112] lr: 0.000020 closs: 0.5714 (0.8609) grad_norm: 2.5582 (4.1133) time: 5.7318 data: 0.0002 max mem: 71323 +[05:51:52.860038] Epoch: [0] [1000/3112] lr: 0.000020 closs: 0.6348 (0.8608) grad_norm: 2.4464 (4.0930) time: 5.7425 data: 0.0002 max mem: 71323 +[05:52:50.242475] Epoch: [0] [1010/3112] lr: 0.000020 closs: 0.6348 (0.8582) grad_norm: 2.7238 (4.0881) time: 5.7400 data: 0.0002 max mem: 71323 +[05:53:47.459212] Epoch: [0] [1020/3112] lr: 0.000020 closs: 0.5085 (0.8559) grad_norm: 2.7238 (4.0743) time: 5.7299 data: 0.0002 max mem: 71323 +[05:54:44.793282] Epoch: [0] [1030/3112] lr: 0.000020 closs: 0.5058 (0.8535) grad_norm: 2.4464 (4.0599) time: 5.7274 data: 0.0002 max mem: 71323 +[05:55:41.992415] Epoch: [0] [1040/3112] lr: 0.000020 closs: 0.5126 (0.8507) grad_norm: 2.4464 (4.0740) time: 5.7265 data: 0.0002 max mem: 71323 +[05:56:39.258581] Epoch: [0] [1050/3112] lr: 0.000020 closs: 0.5311 (0.8484) grad_norm: 2.4542 (4.0747) time: 5.7231 data: 0.0002 max mem: 71323 +[05:57:36.445676] Epoch: [0] [1060/3112] lr: 0.000019 closs: 0.5625 (0.8481) grad_norm: 2.4542 (4.0732) time: 5.7225 data: 0.0002 max mem: 71323 +[05:58:33.779989] Epoch: [0] [1070/3112] lr: 0.000019 closs: 0.5274 (0.8445) grad_norm: 2.4542 (4.0717) time: 5.7259 data: 0.0002 max mem: 71323 +[05:59:30.940723] Epoch: [0] [1080/3112] lr: 0.000019 closs: 0.5226 (0.8447) grad_norm: 2.4542 (4.0562) time: 5.7246 data: 0.0002 max mem: 71323 +[06:00:28.107715] Epoch: [0] [1090/3112] lr: 0.000019 closs: 0.6270 (0.8450) grad_norm: 2.4542 (4.0462) time: 5.7163 data: 0.0002 max mem: 71323 +[06:01:25.483216] Epoch: [0] [1100/3112] lr: 0.000019 closs: 0.6270 (0.8439) grad_norm: 2.4295 (4.0251) time: 5.7270 data: 0.0002 max mem: 71323 +[06:02:22.841987] Epoch: [0] [1110/3112] lr: 0.000019 closs: 0.5958 (0.8413) grad_norm: 2.4542 (4.0191) time: 5.7365 data: 0.0002 max mem: 71323 +[06:03:20.386487] Epoch: [0] [1120/3112] lr: 0.000019 closs: 0.4872 (0.8382) grad_norm: 2.3856 (3.9980) time: 5.7450 data: 0.0002 max mem: 71323 +[06:04:17.671583] Epoch: [0] [1130/3112] lr: 0.000019 closs: 0.5300 (0.8368) grad_norm: 2.2857 (3.9864) time: 5.7414 data: 0.0002 max mem: 71323 +[06:05:15.017762] Epoch: [0] [1140/3112] lr: 0.000019 closs: 0.5631 (0.8343) grad_norm: 2.3856 (3.9695) time: 5.7315 data: 0.0002 max mem: 71323 +[06:06:12.290479] Epoch: [0] [1150/3112] lr: 0.000019 closs: 0.5549 (0.8329) grad_norm: 2.2244 (3.9502) time: 5.7308 data: 0.0002 max mem: 71323 +[06:07:09.519325] Epoch: [0] [1160/3112] lr: 0.000019 closs: 0.5549 (0.8320) grad_norm: 2.3856 (3.9492) time: 5.7249 data: 0.0002 max mem: 71323 +[06:08:06.733028] Epoch: [0] [1170/3112] lr: 0.000019 closs: 0.4562 (0.8294) grad_norm: 2.1655 (3.9393) time: 5.7220 data: 0.0002 max mem: 71323 +[06:09:03.932280] Epoch: [0] [1180/3112] lr: 0.000019 closs: 0.5481 (0.8276) grad_norm: 2.3856 (3.9333) time: 5.7205 data: 0.0002 max mem: 71323 +[06:10:01.053954] Epoch: [0] [1190/3112] lr: 0.000019 closs: 0.6016 (0.8275) grad_norm: 2.3880 (3.9390) time: 5.7159 data: 0.0002 max mem: 71323 +[06:10:58.319614] Epoch: [0] [1200/3112] lr: 0.000019 closs: 0.3608 (0.8244) grad_norm: 2.5395 (3.9326) time: 5.7193 data: 0.0002 max mem: 71323 +[06:11:55.738856] Epoch: [0] [1210/3112] lr: 0.000019 closs: 0.3788 (0.8225) 
grad_norm: 2.6872 (3.9370) time: 5.7342 data: 0.0002 max mem: 71323 +[06:12:52.934490] Epoch: [0] [1220/3112] lr: 0.000019 closs: 0.4301 (0.8196) grad_norm: 2.6872 (3.9208) time: 5.7306 data: 0.0002 max mem: 71323 +[06:13:50.221943] Epoch: [0] [1230/3112] lr: 0.000019 closs: 0.4900 (0.8177) grad_norm: 3.1796 (3.9161) time: 5.7240 data: 0.0002 max mem: 71323 +[06:14:47.491654] Epoch: [0] [1240/3112] lr: 0.000019 closs: 0.5376 (0.8164) grad_norm: 3.1796 (3.9156) time: 5.7277 data: 0.0002 max mem: 71323 +[06:15:44.821158] Epoch: [0] [1250/3112] lr: 0.000019 closs: 0.6686 (0.8158) grad_norm: 3.1796 (3.9105) time: 5.7298 data: 0.0002 max mem: 71323 +[06:16:42.120046] Epoch: [0] [1260/3112] lr: 0.000019 closs: 0.6691 (0.8143) grad_norm: 2.8718 (3.8922) time: 5.7313 data: 0.0002 max mem: 71323 +[06:17:39.322626] Epoch: [0] [1270/3112] lr: 0.000019 closs: 0.5977 (0.8127) grad_norm: 2.6040 (3.8900) time: 5.7249 data: 0.0002 max mem: 71323 +[06:18:36.542903] Epoch: [0] [1280/3112] lr: 0.000019 closs: 0.4713 (0.8104) grad_norm: 2.5307 (3.8831) time: 5.7211 data: 0.0002 max mem: 71323 +[06:19:33.683038] Epoch: [0] [1290/3112] lr: 0.000019 closs: 0.5683 (0.8098) grad_norm: 2.5307 (3.8736) time: 5.7179 data: 0.0002 max mem: 71323 +[06:20:30.892664] Epoch: [0] [1300/3112] lr: 0.000019 closs: 0.6848 (0.8086) grad_norm: 2.6040 (3.8702) time: 5.7174 data: 0.0002 max mem: 71323 +[06:21:28.325053] Epoch: [0] [1310/3112] lr: 0.000019 closs: 0.7400 (0.8070) grad_norm: 2.5307 (3.8547) time: 5.7320 data: 0.0002 max mem: 71323 +[06:22:25.796944] Epoch: [0] [1320/3112] lr: 0.000019 closs: 0.5553 (0.8052) grad_norm: 2.2311 (3.8372) time: 5.7451 data: 0.0002 max mem: 71323 +[06:23:23.060187] Epoch: [0] [1330/3112] lr: 0.000019 closs: 0.5473 (0.8046) grad_norm: 2.2078 (3.8300) time: 5.7366 data: 0.0002 max mem: 71323 +[06:24:20.411375] Epoch: [0] [1340/3112] lr: 0.000019 closs: 0.5773 (0.8038) grad_norm: 2.1947 (3.8142) time: 5.7305 data: 0.0002 max mem: 71323 +[06:25:17.604299] Epoch: [0] [1350/3112] lr: 0.000019 closs: 0.5395 (0.8021) grad_norm: 2.2311 (3.8074) time: 5.7270 data: 0.0002 max mem: 71323 +[06:26:14.844697] Epoch: [0] [1360/3112] lr: 0.000019 closs: 0.4502 (0.8012) grad_norm: 2.1239 (3.7950) time: 5.7215 data: 0.0002 max mem: 71323 +[06:27:12.086839] Epoch: [0] [1370/3112] lr: 0.000019 closs: 0.7564 (0.8010) grad_norm: 2.2311 (3.7896) time: 5.7239 data: 0.0002 max mem: 71323 +[06:28:09.261435] Epoch: [0] [1380/3112] lr: 0.000019 closs: 0.6765 (0.7996) grad_norm: 2.2311 (3.7788) time: 5.7206 data: 0.0002 max mem: 71323 +[06:29:06.477833] Epoch: [0] [1390/3112] lr: 0.000019 closs: 0.5400 (0.7978) grad_norm: 2.2784 (3.7709) time: 5.7194 data: 0.0002 max mem: 71323 +[06:30:03.639621] Epoch: [0] [1400/3112] lr: 0.000019 closs: 0.4780 (0.7954) grad_norm: 2.5524 (3.7785) time: 5.7187 data: 0.0002 max mem: 71323 +[06:31:00.831154] Epoch: [0] [1410/3112] lr: 0.000019 closs: 0.4043 (0.7927) grad_norm: 2.5524 (3.7734) time: 5.7175 data: 0.0002 max mem: 71323 +[06:31:58.143544] Epoch: [0] [1420/3112] lr: 0.000019 closs: 0.4169 (0.7904) grad_norm: 2.6732 (3.7719) time: 5.7251 data: 0.0002 max mem: 71323 +[06:32:55.396868] Epoch: [0] [1430/3112] lr: 0.000019 closs: 0.5712 (0.7901) grad_norm: 2.6732 (3.7701) time: 5.7282 data: 0.0002 max mem: 71323 +[06:33:52.655664] Epoch: [0] [1440/3112] lr: 0.000019 closs: 0.5712 (0.7894) grad_norm: 2.6693 (3.7571) time: 5.7254 data: 0.0002 max mem: 71323 +[06:34:49.977905] Epoch: [0] [1450/3112] lr: 0.000019 closs: 0.5311 (0.7885) grad_norm: 2.7060 (3.7515) time: 5.7289 data: 
0.0002 max mem: 71323 +[06:35:47.191414] Epoch: [0] [1460/3112] lr: 0.000019 closs: 0.5186 (0.7875) grad_norm: 2.7060 (3.7477) time: 5.7266 data: 0.0002 max mem: 71323 +[06:36:44.526351] Epoch: [0] [1470/3112] lr: 0.000019 closs: 0.5398 (0.7867) grad_norm: 2.7060 (3.7445) time: 5.7273 data: 0.0002 max mem: 71323 +[06:37:41.818097] Epoch: [0] [1480/3112] lr: 0.000019 closs: 0.5919 (0.7853) grad_norm: 2.7060 (3.7423) time: 5.7312 data: 0.0002 max mem: 71323 +[06:38:38.929895] Epoch: [0] [1490/3112] lr: 0.000019 closs: 0.4855 (0.7841) grad_norm: 2.7060 (3.7351) time: 5.7201 data: 0.0002 max mem: 71323 +[06:39:36.156676] Epoch: [0] [1500/3112] lr: 0.000019 closs: 0.4855 (0.7821) grad_norm: 2.6556 (3.7255) time: 5.7168 data: 0.0002 max mem: 71323 +[06:40:33.240674] Epoch: [0] [1510/3112] lr: 0.000019 closs: 0.5473 (0.7819) grad_norm: 2.6556 (3.7371) time: 5.7154 data: 0.0001 max mem: 71323 +[06:41:30.391827] Epoch: [0] [1520/3112] lr: 0.000019 closs: 0.7923 (0.7828) grad_norm: 2.8088 (3.7370) time: 5.7116 data: 0.0002 max mem: 71323 +[06:42:27.638409] Epoch: [0] [1530/3112] lr: 0.000019 closs: 0.5792 (0.7807) grad_norm: 2.9563 (3.7401) time: 5.7197 data: 0.0002 max mem: 71323 +[06:43:24.837859] Epoch: [0] [1540/3112] lr: 0.000019 closs: 0.5018 (0.7796) grad_norm: 2.9563 (3.7377) time: 5.7221 data: 0.0002 max mem: 71323 +[06:44:22.044722] Epoch: [0] [1550/3112] lr: 0.000019 closs: 0.5571 (0.7780) grad_norm: 2.9488 (3.7302) time: 5.7201 data: 0.0002 max mem: 71323 +[06:45:19.510177] Epoch: [0] [1560/3112] lr: 0.000019 closs: 0.5122 (0.7767) grad_norm: 2.7489 (3.7216) time: 5.7334 data: 0.0002 max mem: 71323 +[06:46:16.866863] Epoch: [0] [1570/3112] lr: 0.000019 closs: 0.3836 (0.7747) grad_norm: 2.9488 (3.7246) time: 5.7410 data: 0.0002 max mem: 71323 +[06:47:14.202898] Epoch: [0] [1580/3112] lr: 0.000019 closs: 0.3950 (0.7737) grad_norm: 3.2680 (3.7218) time: 5.7345 data: 0.0002 max mem: 71323 +[06:48:11.432298] Epoch: [0] [1590/3112] lr: 0.000019 closs: 0.4247 (0.7717) grad_norm: 2.9563 (3.7154) time: 5.7281 data: 0.0002 max mem: 71323 +[06:49:08.626898] Epoch: [0] [1600/3112] lr: 0.000019 closs: 0.3819 (0.7703) grad_norm: 2.8225 (3.7091) time: 5.7211 data: 0.0002 max mem: 71323 +[06:50:05.850837] Epoch: [0] [1610/3112] lr: 0.000019 closs: 0.5528 (0.7697) grad_norm: 2.5235 (3.6995) time: 5.7208 data: 0.0002 max mem: 71323 +[06:51:03.102285] Epoch: [0] [1620/3112] lr: 0.000019 closs: 0.5908 (0.7682) grad_norm: 2.8225 (3.7088) time: 5.7236 data: 0.0002 max mem: 71323 +[06:52:00.398118] Epoch: [0] [1630/3112] lr: 0.000019 closs: 0.4178 (0.7657) grad_norm: 3.1036 (3.7059) time: 5.7272 data: 0.0002 max mem: 71323 +[06:52:57.725493] Epoch: [0] [1640/3112] lr: 0.000019 closs: 0.4797 (0.7641) grad_norm: 2.8225 (3.6934) time: 5.7310 data: 0.0002 max mem: 71323 +[06:53:54.984072] Epoch: [0] [1650/3112] lr: 0.000019 closs: 0.5062 (0.7623) grad_norm: 2.2510 (3.6851) time: 5.7292 data: 0.0002 max mem: 71323 +[06:54:52.176037] Epoch: [0] [1660/3112] lr: 0.000019 closs: 0.5231 (0.7629) grad_norm: 2.2510 (3.6873) time: 5.7224 data: 0.0001 max mem: 71323 +[06:55:49.431752] Epoch: [0] [1670/3112] lr: 0.000019 closs: 0.4355 (0.7611) grad_norm: 2.3797 (3.6910) time: 5.7222 data: 0.0002 max mem: 71323 +[06:56:46.482096] Epoch: [0] [1680/3112] lr: 0.000019 closs: 0.3685 (0.7592) grad_norm: 2.2510 (3.6842) time: 5.7151 data: 0.0002 max mem: 71323 +[06:57:43.766728] Epoch: [0] [1690/3112] lr: 0.000019 closs: 0.4530 (0.7585) grad_norm: 2.7506 (3.6928) time: 5.7166 data: 0.0002 max mem: 71323 +[06:58:41.108103] Epoch: 
[0] [1700/3112] lr: 0.000019 closs: 0.4830 (0.7562) grad_norm: 2.5801 (3.7039) time: 5.7312 data: 0.0002 max mem: 71323 +[06:59:38.255671] Epoch: [0] [1710/3112] lr: 0.000019 closs: 0.5038 (0.7561) grad_norm: 2.5149 (3.6995) time: 5.7243 data: 0.0002 max mem: 71323 +[07:00:35.563634] Epoch: [0] [1720/3112] lr: 0.000019 closs: 0.5038 (0.7552) grad_norm: 2.7506 (3.6976) time: 5.7227 data: 0.0002 max mem: 71323 +[07:01:33.052716] Epoch: [0] [1730/3112] lr: 0.000019 closs: 0.4345 (0.7551) grad_norm: 2.9848 (3.7056) time: 5.7398 data: 0.0002 max mem: 71323 +[07:02:30.297288] Epoch: [0] [1740/3112] lr: 0.000019 closs: 0.5458 (0.7543) grad_norm: 3.0247 (3.7057) time: 5.7366 data: 0.0002 max mem: 71323 +[07:03:27.555753] Epoch: [0] [1750/3112] lr: 0.000019 closs: 0.5458 (0.7532) grad_norm: 2.7506 (3.6991) time: 5.7250 data: 0.0002 max mem: 71323 +[07:04:24.886564] Epoch: [0] [1760/3112] lr: 0.000018 closs: 0.5863 (0.7530) grad_norm: 2.7506 (3.6966) time: 5.7294 data: 0.0002 max mem: 71323 +[07:05:22.103669] Epoch: [0] [1770/3112] lr: 0.000018 closs: 0.5898 (0.7515) grad_norm: 2.5149 (3.6965) time: 5.7273 data: 0.0002 max mem: 71323 +[07:06:19.490222] Epoch: [0] [1780/3112] lr: 0.000018 closs: 0.5326 (0.7503) grad_norm: 2.9848 (3.6956) time: 5.7300 data: 0.0002 max mem: 71323 +[07:07:16.926521] Epoch: [0] [1790/3112] lr: 0.000018 closs: 0.3045 (0.7494) grad_norm: 3.0114 (3.6924) time: 5.7410 data: 0.0002 max mem: 71323 +[07:08:14.240513] Epoch: [0] [1800/3112] lr: 0.000018 closs: 0.4717 (0.7490) grad_norm: 2.7588 (3.6831) time: 5.7373 data: 0.0002 max mem: 71323 +[07:09:11.405657] Epoch: [0] [1810/3112] lr: 0.000018 closs: 0.5537 (0.7477) grad_norm: 2.7588 (3.6805) time: 5.7238 data: 0.0001 max mem: 71323 +[07:10:08.645254] Epoch: [0] [1820/3112] lr: 0.000018 closs: 0.5537 (0.7463) grad_norm: 2.9282 (3.6790) time: 5.7201 data: 0.0002 max mem: 71323 +[07:11:05.896495] Epoch: [0] [1830/3112] lr: 0.000018 closs: 0.6142 (0.7464) grad_norm: 2.9452 (3.6798) time: 5.7244 data: 0.0002 max mem: 71323 +[07:12:03.038624] Epoch: [0] [1840/3112] lr: 0.000018 closs: 0.6043 (0.7454) grad_norm: 2.9452 (3.6702) time: 5.7195 data: 0.0002 max mem: 71323 +[07:13:00.432614] Epoch: [0] [1850/3112] lr: 0.000018 closs: 0.4751 (0.7445) grad_norm: 2.9282 (3.6636) time: 5.7266 data: 0.0002 max mem: 71323 +[07:13:57.586472] Epoch: [0] [1860/3112] lr: 0.000018 closs: 0.4907 (0.7435) grad_norm: 2.7588 (3.6635) time: 5.7272 data: 0.0002 max mem: 71323 +[07:14:54.663032] Epoch: [0] [1870/3112] lr: 0.000018 closs: 0.4907 (0.7434) grad_norm: 2.6462 (3.6601) time: 5.7114 data: 0.0002 max mem: 71323 +[07:15:51.791312] Epoch: [0] [1880/3112] lr: 0.000018 closs: 0.3721 (0.7413) grad_norm: 2.6462 (3.6477) time: 5.7101 data: 0.0002 max mem: 71323 +[07:16:49.037887] Epoch: [0] [1890/3112] lr: 0.000018 closs: 0.3982 (0.7415) grad_norm: 2.7741 (3.6476) time: 5.7186 data: 0.0002 max mem: 71323 +[07:17:46.307802] Epoch: [0] [1900/3112] lr: 0.000018 closs: 0.5617 (0.7406) grad_norm: 2.6462 (3.6436) time: 5.7257 data: 0.0002 max mem: 71323 +[07:18:43.615330] Epoch: [0] [1910/3112] lr: 0.000018 closs: 0.4365 (0.7391) grad_norm: 2.7741 (3.6429) time: 5.7287 data: 0.0002 max mem: 71323 +[07:19:40.815841] Epoch: [0] [1920/3112] lr: 0.000018 closs: 0.4365 (0.7385) grad_norm: 2.7741 (3.6414) time: 5.7252 data: 0.0002 max mem: 71323 +[07:20:38.037830] Epoch: [0] [1930/3112] lr: 0.000018 closs: 0.4888 (0.7370) grad_norm: 2.9515 (3.6385) time: 5.7210 data: 0.0002 max mem: 71323 +[07:21:35.320321] Epoch: [0] [1940/3112] lr: 0.000018 closs: 0.4916 
(0.7363) grad_norm: 2.9515 (3.6338) time: 5.7250 data: 0.0002 max mem: 71323 +[07:22:32.609933] Epoch: [0] [1950/3112] lr: 0.000018 closs: 0.4592 (0.7353) grad_norm: 2.9515 (3.6337) time: 5.7284 data: 0.0002 max mem: 71323 +[07:23:30.048719] Epoch: [0] [1960/3112] lr: 0.000018 closs: 0.6580 (0.7357) grad_norm: 3.0623 (3.6343) time: 5.7362 data: 0.0002 max mem: 71323 +[07:24:27.169904] Epoch: [0] [1970/3112] lr: 0.000018 closs: 0.5419 (0.7346) grad_norm: 3.0623 (3.6350) time: 5.7278 data: 0.0002 max mem: 71323 +[07:25:24.510536] Epoch: [0] [1980/3112] lr: 0.000018 closs: 0.5004 (0.7335) grad_norm: 3.0313 (3.6250) time: 5.7229 data: 0.0001 max mem: 71323 +[07:26:21.819608] Epoch: [0] [1990/3112] lr: 0.000018 closs: 0.5335 (0.7334) grad_norm: 2.8518 (3.6233) time: 5.7323 data: 0.0001 max mem: 71323 +[07:27:19.245696] Epoch: [0] [2000/3112] lr: 0.000018 closs: 0.5335 (0.7333) grad_norm: 2.8478 (3.6122) time: 5.7366 data: 0.0002 max mem: 71323 +[07:28:16.638958] Epoch: [0] [2010/3112] lr: 0.000018 closs: 0.6057 (0.7327) grad_norm: 2.8478 (3.6108) time: 5.7408 data: 0.0002 max mem: 71323 +[07:29:13.963526] Epoch: [0] [2020/3112] lr: 0.000018 closs: 0.6270 (0.7336) grad_norm: 2.7283 (3.6035) time: 5.7357 data: 0.0002 max mem: 71323 +[07:30:11.182574] Epoch: [0] [2030/3112] lr: 0.000018 closs: 0.8382 (0.7341) grad_norm: 2.7283 (3.5991) time: 5.7270 data: 0.0002 max mem: 71323 +[07:31:08.336234] Epoch: [0] [2040/3112] lr: 0.000018 closs: 0.7897 (0.7356) grad_norm: 2.5445 (3.5938) time: 5.7185 data: 0.0002 max mem: 71323 +[07:32:05.508705] Epoch: [0] [2050/3112] lr: 0.000018 closs: 0.7610 (0.7356) grad_norm: 2.4342 (3.5934) time: 5.7161 data: 0.0002 max mem: 71323 +[07:33:02.743547] Epoch: [0] [2060/3112] lr: 0.000018 closs: 0.5552 (0.7352) grad_norm: 2.5783 (3.5969) time: 5.7202 data: 0.0002 max mem: 71323 +[07:33:59.840692] Epoch: [0] [2070/3112] lr: 0.000018 closs: 0.4490 (0.7337) grad_norm: 2.5783 (3.5967) time: 5.7164 data: 0.0002 max mem: 71323 +[07:34:57.130667] Epoch: [0] [2080/3112] lr: 0.000018 closs: 0.4523 (0.7331) grad_norm: 3.0582 (3.6553) time: 5.7192 data: 0.0002 max mem: 71323 +[07:35:54.373771] Epoch: [0] [2090/3112] lr: 0.000018 closs: 0.6364 (0.7325) grad_norm: 2.4342 (3.6490) time: 5.7265 data: 0.0002 max mem: 71323 +[07:36:51.580750] Epoch: [0] [2100/3112] lr: 0.000018 closs: 0.5548 (0.7318) grad_norm: 2.4342 (3.6425) time: 5.7224 data: 0.0002 max mem: 71323 +[07:37:48.980137] Epoch: [0] [2110/3112] lr: 0.000018 closs: 0.5420 (0.7309) grad_norm: 2.4342 (3.6378) time: 5.7302 data: 0.0002 max mem: 71323 +[07:38:46.431090] Epoch: [0] [2120/3112] lr: 0.000018 closs: 0.6151 (0.7308) grad_norm: 2.3922 (3.6311) time: 5.7424 data: 0.0001 max mem: 71323 +[07:39:43.583346] Epoch: [0] [2130/3112] lr: 0.000018 closs: 0.6662 (0.7304) grad_norm: 2.3083 (3.6240) time: 5.7300 data: 0.0002 max mem: 71323 +[07:40:40.745457] Epoch: [0] [2140/3112] lr: 0.000018 closs: 0.6662 (0.7321) grad_norm: 2.3083 (3.6253) time: 5.7156 data: 0.0002 max mem: 71323 +[07:41:38.020597] Epoch: [0] [2150/3112] lr: 0.000018 closs: 0.7295 (0.7326) grad_norm: 2.3083 (3.6223) time: 5.7218 data: 0.0002 max mem: 71323 +[07:42:35.322031] Epoch: [0] [2160/3112] lr: 0.000018 closs: 0.6320 (0.7315) grad_norm: 2.3083 (3.6235) time: 5.7287 data: 0.0001 max mem: 71323 +[07:43:32.654722] Epoch: [0] [2170/3112] lr: 0.000018 closs: 0.6320 (0.7315) grad_norm: 2.3328 (3.6221) time: 5.7315 data: 0.0002 max mem: 71323 +[07:44:29.914557] Epoch: [0] [2180/3112] lr: 0.000018 closs: 0.4974 (0.7305) grad_norm: 2.3922 (3.6169) time: 5.7295 
data: 0.0002 max mem: 71323 +[07:45:27.091229] Epoch: [0] [2190/3112] lr: 0.000018 closs: 0.4557 (0.7298) grad_norm: 2.3922 (3.6113) time: 5.7217 data: 0.0002 max mem: 71323 +[07:46:24.255225] Epoch: [0] [2200/3112] lr: 0.000018 closs: 0.5135 (0.7288) grad_norm: 2.5733 (3.6104) time: 5.7169 data: 0.0002 max mem: 71323 +[07:47:21.502240] Epoch: [0] [2210/3112] lr: 0.000018 closs: 0.5135 (0.7283) grad_norm: 2.5977 (3.6074) time: 5.7204 data: 0.0002 max mem: 71323 +[07:48:18.795965] Epoch: [0] [2220/3112] lr: 0.000018 closs: 0.5792 (0.7288) grad_norm: 2.5977 (3.5992) time: 5.7269 data: 0.0002 max mem: 71323 +[07:49:16.012979] Epoch: [0] [2230/3112] lr: 0.000018 closs: 0.4997 (0.7278) grad_norm: 2.5733 (3.5917) time: 5.7254 data: 0.0002 max mem: 71323 +[07:50:13.203962] Epoch: [0] [2240/3112] lr: 0.000018 closs: 0.5082 (0.7277) grad_norm: 2.5733 (3.5912) time: 5.7203 data: 0.0002 max mem: 71323 +[07:51:10.397617] Epoch: [0] [2250/3112] lr: 0.000017 closs: 0.7118 (0.7286) grad_norm: 2.5372 (3.5848) time: 5.7191 data: 0.0002 max mem: 71323 +[07:52:07.654855] Epoch: [0] [2260/3112] lr: 0.000017 closs: 0.4129 (0.7274) grad_norm: 2.2236 (3.5790) time: 5.7224 data: 0.0002 max mem: 71323 +[07:53:05.112644] Epoch: [0] [2270/3112] lr: 0.000017 closs: 0.5399 (0.7276) grad_norm: 2.4273 (3.5763) time: 5.7356 data: 0.0001 max mem: 71323 +[07:54:02.436876] Epoch: [0] [2280/3112] lr: 0.000017 closs: 0.5124 (0.7264) grad_norm: 2.5085 (3.5782) time: 5.7390 data: 0.0002 max mem: 71323 +[07:54:59.581613] Epoch: [0] [2290/3112] lr: 0.000017 closs: 0.4684 (0.7255) grad_norm: 2.5085 (3.5738) time: 5.7233 data: 0.0002 max mem: 71323 +[07:55:56.848306] Epoch: [0] [2300/3112] lr: 0.000017 closs: 0.4999 (0.7251) grad_norm: 2.7753 (3.5764) time: 5.7205 data: 0.0001 max mem: 71323 +[07:56:54.148558] Epoch: [0] [2310/3112] lr: 0.000017 closs: 0.4999 (0.7243) grad_norm: 2.8772 (3.5737) time: 5.7282 data: 0.0001 max mem: 71323 +[07:57:51.305268] Epoch: [0] [2320/3112] lr: 0.000017 closs: 0.4652 (0.7244) grad_norm: 2.8772 (3.5769) time: 5.7227 data: 0.0002 max mem: 71323 +[07:58:48.758125] Epoch: [0] [2330/3112] lr: 0.000017 closs: 0.4826 (0.7235) grad_norm: 3.0775 (3.5767) time: 5.7304 data: 0.0002 max mem: 71323 +[07:59:45.901830] Epoch: [0] [2340/3112] lr: 0.000017 closs: 0.5619 (0.7238) grad_norm: 3.0775 (3.5686) time: 5.7297 data: 0.0002 max mem: 71323 +[08:00:42.979463] Epoch: [0] [2350/3112] lr: 0.000017 closs: 0.5979 (0.7237) grad_norm: 3.1182 (3.5693) time: 5.7109 data: 0.0002 max mem: 71323 +[08:01:40.247738] Epoch: [0] [2360/3112] lr: 0.000017 closs: 0.7751 (0.7241) grad_norm: 3.5811 (3.6443) time: 5.7171 data: 0.0003 max mem: 71323 +[08:02:37.501019] Epoch: [0] [2370/3112] lr: 0.000017 closs: 0.7241 (0.7239) grad_norm: 3.5811 (3.6401) time: 5.7259 data: 0.0003 max mem: 71323 +[08:03:34.656352] Epoch: [0] [2380/3112] lr: 0.000017 closs: 0.6760 (0.7237) grad_norm: 2.4389 (3.6314) time: 5.7203 data: 0.0002 max mem: 71323 +[08:04:31.870672] Epoch: [0] [2390/3112] lr: 0.000017 closs: 0.6843 (0.7238) grad_norm: 2.3977 (3.6235) time: 5.7183 data: 0.0002 max mem: 71323 +[08:05:29.039660] Epoch: [0] [2400/3112] lr: 0.000017 closs: 0.5994 (0.7229) grad_norm: 2.3378 (3.6193) time: 5.7190 data: 0.0002 max mem: 71323 +[08:06:26.270395] Epoch: [0] [2410/3112] lr: 0.000017 closs: 0.5742 (0.7229) grad_norm: 2.3249 (3.6149) time: 5.7198 data: 0.0002 max mem: 71323 +[08:07:23.447178] Epoch: [0] [2420/3112] lr: 0.000017 closs: 0.6539 (0.7227) grad_norm: 2.3967 (3.6299) time: 5.7202 data: 0.0002 max mem: 71323 +[08:08:20.647939] 
Epoch: [0] [2430/3112] lr: 0.000017 closs: 0.6545 (0.7226) grad_norm: 2.4389 (3.6284) time: 5.7187 data: 0.0001 max mem: 71323 +[08:09:17.890492] Epoch: [0] [2440/3112] lr: 0.000017 closs: 0.5689 (0.7232) grad_norm: 2.3378 (3.6236) time: 5.7220 data: 0.0002 max mem: 71323 +[08:10:15.037537] Epoch: [0] [2450/3112] lr: 0.000017 closs: 0.6907 (0.7230) grad_norm: 2.3378 (3.6281) time: 5.7193 data: 0.0002 max mem: 71323 +[08:11:12.222638] Epoch: [0] [2460/3112] lr: 0.000017 closs: 0.5892 (0.7225) grad_norm: 2.6056 (3.6223) time: 5.7164 data: 0.0002 max mem: 71323 +[08:12:09.560452] Epoch: [0] [2470/3112] lr: 0.000017 closs: 0.6734 (0.7232) grad_norm: 2.6194 (3.6179) time: 5.7260 data: 0.0002 max mem: 71323 +[08:13:06.821109] Epoch: [0] [2480/3112] lr: 0.000017 closs: 0.8070 (0.7234) grad_norm: 2.6194 (3.6123) time: 5.7297 data: 0.0002 max mem: 71323 +[08:14:04.122432] Epoch: [0] [2490/3112] lr: 0.000017 closs: 0.6555 (0.7231) grad_norm: 2.7133 (3.6149) time: 5.7279 data: 0.0002 max mem: 71323 +[08:15:01.375411] Epoch: [0] [2500/3112] lr: 0.000017 closs: 0.4324 (0.7224) grad_norm: 2.7133 (3.6283) time: 5.7276 data: 0.0002 max mem: 71323 +[08:15:58.593116] Epoch: [0] [2510/3112] lr: 0.000017 closs: 0.4084 (0.7210) grad_norm: 2.7133 (3.6258) time: 5.7234 data: 0.0002 max mem: 71323 +[08:16:55.848610] Epoch: [0] [2520/3112] lr: 0.000017 closs: 0.4026 (0.7197) grad_norm: 3.1204 (3.6258) time: 5.7235 data: 0.0002 max mem: 71323 +[08:17:53.314274] Epoch: [0] [2530/3112] lr: 0.000017 closs: 0.5344 (0.7200) grad_norm: 3.1204 (3.6278) time: 5.7359 data: 0.0002 max mem: 71323 +[08:18:50.536905] Epoch: [0] [2540/3112] lr: 0.000017 closs: 0.6291 (0.7195) grad_norm: 3.1204 (3.6305) time: 5.7343 data: 0.0002 max mem: 71323 +[08:19:47.644893] Epoch: [0] [2550/3112] lr: 0.000017 closs: 0.5104 (0.7190) grad_norm: 3.1204 (3.6254) time: 5.7164 data: 0.0002 max mem: 71323 +[08:20:44.763872] Epoch: [0] [2560/3112] lr: 0.000017 closs: 0.5104 (0.7187) grad_norm: 3.1204 (3.6259) time: 5.7112 data: 0.0002 max mem: 71323 +[08:21:41.912744] Epoch: [0] [2570/3112] lr: 0.000017 closs: 0.6185 (0.7186) grad_norm: 3.1020 (3.6299) time: 5.7132 data: 0.0002 max mem: 71323 +[08:22:39.047092] Epoch: [0] [2580/3112] lr: 0.000017 closs: 0.5959 (0.7182) grad_norm: 2.3011 (3.6200) time: 5.7140 data: 0.0002 max mem: 71323 +[08:23:36.258852] Epoch: [0] [2590/3112] lr: 0.000017 closs: 0.5658 (0.7179) grad_norm: 2.1841 (3.6123) time: 5.7171 data: 0.0002 max mem: 71323 +[08:24:33.587450] Epoch: [0] [2600/3112] lr: 0.000017 closs: 0.3892 (0.7170) grad_norm: 2.3011 (3.6141) time: 5.7269 data: 0.0002 max mem: 71323 +[08:25:30.834611] Epoch: [0] [2610/3112] lr: 0.000017 closs: 0.6581 (0.7179) grad_norm: 2.1841 (3.6122) time: 5.7286 data: 0.0002 max mem: 71323 +[08:26:28.066748] Epoch: [0] [2620/3112] lr: 0.000017 closs: 0.6581 (0.7182) grad_norm: 2.0764 (3.6137) time: 5.7238 data: 0.0002 max mem: 71323 +[08:27:25.324375] Epoch: [0] [2630/3112] lr: 0.000017 closs: 0.6386 (0.7173) grad_norm: 2.3011 (3.6248) time: 5.7243 data: 0.0002 max mem: 71323 +[08:28:22.514013] Epoch: [0] [2640/3112] lr: 0.000017 closs: 0.4277 (0.7160) grad_norm: 2.4036 (3.6359) time: 5.7222 data: 0.0002 max mem: 71323 +[08:29:19.822807] Epoch: [0] [2650/3112] lr: 0.000017 closs: 0.4277 (0.7158) grad_norm: 2.4036 (3.6328) time: 5.7247 data: 0.0002 max mem: 71323 +[08:30:17.155743] Epoch: [0] [2660/3112] lr: 0.000016 closs: 0.6473 (0.7156) grad_norm: 2.7300 (3.6260) time: 5.7319 data: 0.0002 max mem: 71323 +[08:31:14.352387] Epoch: [0] [2670/3112] lr: 0.000016 closs: 0.5801 
(0.7154) grad_norm: 3.4464 (3.6264) time: 5.7263 data: 0.0001 max mem: 71323 +[08:32:11.654306] Epoch: [0] [2680/3112] lr: 0.000016 closs: 0.3925 (0.7140) grad_norm: 2.7300 (3.6208) time: 5.7248 data: 0.0001 max mem: 71323 +[08:33:08.926822] Epoch: [0] [2690/3112] lr: 0.000016 closs: 0.4998 (0.7137) grad_norm: 2.7300 (3.6176) time: 5.7286 data: 0.0001 max mem: 71323 +[08:34:06.154199] Epoch: [0] [2700/3112] lr: 0.000016 closs: 0.6299 (0.7137) grad_norm: 2.6203 (3.6119) time: 5.7248 data: 0.0002 max mem: 71323 +[08:35:03.305952] Epoch: [0] [2710/3112] lr: 0.000016 closs: 0.4535 (0.7133) grad_norm: 2.5608 (3.6117) time: 5.7188 data: 0.0002 max mem: 71323 +[08:36:00.521390] Epoch: [0] [2720/3112] lr: 0.000016 closs: 0.4535 (0.7135) grad_norm: 2.5608 (3.6046) time: 5.7183 data: 0.0002 max mem: 71323 +[08:36:58.017736] Epoch: [0] [2730/3112] lr: 0.000016 closs: 0.5954 (0.7134) grad_norm: 2.4625 (3.5983) time: 5.7355 data: 0.0002 max mem: 71323 +[08:37:55.335685] Epoch: [0] [2740/3112] lr: 0.000016 closs: 0.5954 (0.7128) grad_norm: 2.5608 (3.5948) time: 5.7406 data: 0.0002 max mem: 71323 +[08:38:52.805052] Epoch: [0] [2750/3112] lr: 0.000016 closs: 0.4398 (0.7117) grad_norm: 2.5608 (3.5929) time: 5.7393 data: 0.0002 max mem: 71323 +[08:39:50.041925] Epoch: [0] [2760/3112] lr: 0.000016 closs: 0.4101 (0.7109) grad_norm: 2.5608 (3.5919) time: 5.7352 data: 0.0002 max mem: 71323 +[08:40:47.182083] Epoch: [0] [2770/3112] lr: 0.000016 closs: 0.5106 (0.7112) grad_norm: 2.9051 (3.5924) time: 5.7188 data: 0.0002 max mem: 71323 +[08:41:44.303731] Epoch: [0] [2780/3112] lr: 0.000016 closs: 0.3585 (0.7101) grad_norm: 2.9051 (3.5884) time: 5.7130 data: 0.0002 max mem: 71323 +[08:42:41.443374] Epoch: [0] [2790/3112] lr: 0.000016 closs: 0.3585 (0.7094) grad_norm: 2.9519 (3.5890) time: 5.7130 data: 0.0002 max mem: 71323 +[08:43:38.613208] Epoch: [0] [2800/3112] lr: 0.000016 closs: 0.4500 (0.7090) grad_norm: 2.9519 (3.5840) time: 5.7154 data: 0.0002 max mem: 71323 +[08:44:35.976551] Epoch: [0] [2810/3112] lr: 0.000016 closs: 0.4857 (0.7086) grad_norm: 2.9519 (3.5801) time: 5.7265 data: 0.0002 max mem: 71323 +[08:45:33.197799] Epoch: [0] [2820/3112] lr: 0.000016 closs: 0.4857 (0.7081) grad_norm: 2.7940 (3.5754) time: 5.7291 data: 0.0002 max mem: 71323 +[08:46:30.284000] Epoch: [0] [2830/3112] lr: 0.000016 closs: 0.4458 (0.7076) grad_norm: 2.5244 (3.5736) time: 5.7152 data: 0.0002 max mem: 71323 +[08:47:27.504292] Epoch: [0] [2840/3112] lr: 0.000016 closs: 0.4260 (0.7072) grad_norm: 2.5244 (3.5727) time: 5.7152 data: 0.0002 max mem: 71323 +[08:48:24.810958] Epoch: [0] [2850/3112] lr: 0.000016 closs: 0.5686 (0.7066) grad_norm: 2.4720 (3.5690) time: 5.7263 data: 0.0002 max mem: 71323 +[08:49:22.095994] Epoch: [0] [2860/3112] lr: 0.000016 closs: 0.5686 (0.7062) grad_norm: 2.4755 (3.5653) time: 5.7295 data: 0.0002 max mem: 71323 +[08:50:19.255975] Epoch: [0] [2870/3112] lr: 0.000016 closs: 0.7656 (0.7070) grad_norm: 2.4720 (3.5631) time: 5.7221 data: 0.0002 max mem: 71323 +[08:51:16.579503] Epoch: [0] [2880/3112] lr: 0.000016 closs: 0.6600 (0.7065) grad_norm: 2.6178 (3.5606) time: 5.7240 data: 0.0002 max mem: 71323 +[08:52:13.829811] Epoch: [0] [2890/3112] lr: 0.000016 closs: 0.4978 (0.7055) grad_norm: 2.6516 (3.5602) time: 5.7286 data: 0.0002 max mem: 71323 +[08:53:11.066340] Epoch: [0] [2900/3112] lr: 0.000016 closs: 0.3908 (0.7047) grad_norm: 2.8129 (3.5576) time: 5.7242 data: 0.0002 max mem: 71323 +[08:54:08.348939] Epoch: [0] [2910/3112] lr: 0.000016 closs: 0.4668 (0.7043) grad_norm: 2.8129 (3.5569) time: 5.7258 
data: 0.0002 max mem: 71323 +[08:55:05.614213] Epoch: [0] [2920/3112] lr: 0.000016 closs: 0.4462 (0.7035) grad_norm: 2.8254 (3.5613) time: 5.7273 data: 0.0002 max mem: 71323 +[08:56:02.803387] Epoch: [0] [2930/3112] lr: 0.000016 closs: 0.4677 (0.7032) grad_norm: 2.8129 (3.5571) time: 5.7226 data: 0.0002 max mem: 71323 +[08:57:00.014117] Epoch: [0] [2940/3112] lr: 0.000016 closs: 0.4677 (0.7032) grad_norm: 2.8254 (3.5568) time: 5.7199 data: 0.0001 max mem: 71323 +[08:57:57.275301] Epoch: [0] [2950/3112] lr: 0.000016 closs: 0.4380 (0.7040) grad_norm: 3.1264 (3.5574) time: 5.7235 data: 0.0002 max mem: 71323 +[08:58:54.434087] Epoch: [0] [2960/3112] lr: 0.000016 closs: 0.5420 (0.7039) grad_norm: 3.8482 (3.5603) time: 5.7209 data: 0.0002 max mem: 71323 +[08:59:51.823237] Epoch: [0] [2970/3112] lr: 0.000016 closs: 0.4128 (0.7030) grad_norm: 3.8232 (3.5588) time: 5.7273 data: 0.0002 max mem: 71323 +[09:00:49.101344] Epoch: [0] [2980/3112] lr: 0.000016 closs: 0.4071 (0.7027) grad_norm: 3.8232 (3.5538) time: 5.7332 data: 0.0002 max mem: 71323 +[09:01:46.347880] Epoch: [0] [2990/3112] lr: 0.000016 closs: 0.3401 (0.7020) grad_norm: 3.4265 (3.5510) time: 5.7261 data: 0.0002 max mem: 71323 +[09:02:43.604127] Epoch: [0] [3000/3112] lr: 0.000016 closs: 0.4397 (0.7017) grad_norm: 2.4524 (3.5437) time: 5.7250 data: 0.0001 max mem: 71323 +[09:03:40.732892] Epoch: [0] [3010/3112] lr: 0.000016 closs: 0.5497 (0.7013) grad_norm: 2.4199 (3.5393) time: 5.7191 data: 0.0002 max mem: 71323 +[09:04:37.973927] Epoch: [0] [3020/3112] lr: 0.000016 closs: 0.4952 (0.7011) grad_norm: 2.3414 (3.5381) time: 5.7184 data: 0.0002 max mem: 71323 +[09:05:35.167066] Epoch: [0] [3030/3112] lr: 0.000015 closs: 0.4716 (0.7015) grad_norm: 2.1653 (3.5330) time: 5.7216 data: 0.0002 max mem: 71323 +[09:06:32.532510] Epoch: [0] [3040/3112] lr: 0.000015 closs: 0.4667 (0.7006) grad_norm: 2.0589 (3.5329) time: 5.7277 data: 0.0002 max mem: 71323 +[09:07:29.727860] Epoch: [0] [3050/3112] lr: 0.000015 closs: 0.4919 (0.7002) grad_norm: 2.0589 (3.5326) time: 5.7279 data: 0.0002 max mem: 71323 +[09:08:27.088781] Epoch: [0] [3060/3112] lr: 0.000015 closs: 0.5159 (0.7000) grad_norm: 2.0589 (3.5265) time: 5.7276 data: 0.0002 max mem: 71323 +[09:09:24.338843] Epoch: [0] [3070/3112] lr: 0.000015 closs: 0.4659 (0.6995) grad_norm: 1.9044 (3.5231) time: 5.7304 data: 0.0002 max mem: 71323 +[09:10:21.570985] Epoch: [0] [3080/3112] lr: 0.000015 closs: 0.5066 (0.6994) grad_norm: 2.0589 (3.5214) time: 5.7240 data: 0.0002 max mem: 71323 +[09:11:18.712905] Epoch: [0] [3090/3112] lr: 0.000015 closs: 0.5204 (0.6995) grad_norm: 2.1653 (3.5199) time: 5.7185 data: 0.0002 max mem: 71323 +[09:12:15.985878] Epoch: [0] [3100/3112] lr: 0.000015 closs: 0.6277 (0.6992) grad_norm: 2.5048 (3.5162) time: 5.7206 data: 0.0002 max mem: 71323 +[09:13:13.180856] Epoch: [0] [3110/3112] lr: 0.000015 closs: 0.6988 (0.6997) grad_norm: 2.8115 (3.5252) time: 5.7232 data: 0.0001 max mem: 71323 +[09:13:19.286067] Epoch: [0] Total time: 4:57:07 +[09:13:19.287341] Averaged stats: lr: 0.000015 closs: 0.6485 (0.7013) grad_norm: 2.8115 (3.5305) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. 
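A quick consistency check on the epoch-0 totals above: the per-iteration `time:` field sits around 5.72 s and the epoch runs 3112 iterations, so the expected wall clock is roughly 3112 × 5.72 s ≈ 17,800 s ≈ 4 h 57 min, matching the reported `Total time: 4:57:07`. The `Averaged stats` line gives the final running averages for the epoch: closs 0.7013 and grad_norm 3.5305.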
+ warnings.warn(
+[09:13:19.461393] model saved
+/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
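The `UserWarning` above is emitted by PyTorch FSDP on each rank while the optimizer state is gathered for checkpointing, and it names its own replacement API. A minimal migration sketch, assuming a standard FSDP-wrapped model and optimizer (the function and variable names here are illustrative, not taken from the training code):

```python
# Sketch: replace the deprecated FSDP.full_optim_state_dict call
# with FSDP.optim_state_dict (available since PyTorch 2.0).
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def gather_full_optim_state(model: FSDP, optimizer: torch.optim.Optimizer) -> dict:
    # Deprecated (may be removed after PyTorch 2.2):
    #   osd = FSDP.full_optim_state_dict(model, optimizer)
    # Replacement: consolidates the sharded optimizer state according to
    # the model's configured state_dict_type settings.
    osd = FSDP.optim_state_dict(model, optimizer)
    return osd
```

The `model saved` message above and the `optimizer saved` / `other rank-common saved` / `rank-specific saved` messages that follow mark the checkpoint artifacts this run writes at the end of each epoch.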
+ warnings.warn( +[09:13:21.167045] optimizer saved +[09:13:21.167699] other rank-common saved +[09:13:21.171329] rank-specific saved +[09:13:21.181596] log_dir: ./output_dir +[09:13:28.427242] Epoch: [1] [0/3112] lr: 0.000015 closs: 0.7382 (0.7382) time: 7.2451 data: 1.5429 max mem: 71323 +[09:14:25.498090] Epoch: [1] [10/3112] lr: 0.000015 closs: 0.4170 (0.5087) grad_norm: 3.1735 (3.1748) time: 5.8468 data: 0.1404 max mem: 71323 +[09:15:22.730831] Epoch: [1] [20/3112] lr: 0.000015 closs: 0.4801 (0.5733) grad_norm: 2.4090 (2.3285) time: 5.7151 data: 0.0002 max mem: 71323 +[09:16:20.014569] Epoch: [1] [30/3112] lr: 0.000015 closs: 0.6003 (0.6413) grad_norm: 2.4090 (2.2568) time: 5.7257 data: 0.0002 max mem: 71323 +[09:17:17.189994] Epoch: [1] [40/3112] lr: 0.000015 closs: 0.5446 (0.6601) grad_norm: 2.4090 (2.4746) time: 5.7229 data: 0.0002 max mem: 71323 +[09:18:14.434741] Epoch: [1] [50/3112] lr: 0.000015 closs: 0.4765 (0.6157) grad_norm: 2.4090 (3.0298) time: 5.7209 data: 0.0002 max mem: 71323 +[09:19:11.700171] Epoch: [1] [60/3112] lr: 0.000015 closs: 0.5167 (0.6122) grad_norm: 2.4459 (2.8109) time: 5.7254 data: 0.0002 max mem: 71323 +[09:20:08.912256] Epoch: [1] [70/3112] lr: 0.000015 closs: 0.4861 (0.5957) grad_norm: 2.5363 (3.0141) time: 5.7238 data: 0.0002 max mem: 71323 +[09:21:06.068977] Epoch: [1] [80/3112] lr: 0.000015 closs: 0.6749 (0.6078) grad_norm: 2.5026 (2.9230) time: 5.7183 data: 0.0002 max mem: 71323 +[09:22:03.232511] Epoch: [1] [90/3112] lr: 0.000015 closs: 0.6540 (0.5932) grad_norm: 2.4090 (2.7984) time: 5.7159 data: 0.0002 max mem: 71323 +[09:23:00.422832] Epoch: [1] [100/3112] lr: 0.000015 closs: 0.6500 (0.6166) grad_norm: 2.5363 (2.9734) time: 5.7176 data: 0.0002 max mem: 71323 +[09:23:57.634147] Epoch: [1] [110/3112] lr: 0.000015 closs: 0.6491 (0.6058) grad_norm: 2.5026 (2.8661) time: 5.7200 data: 0.0002 max mem: 71323 +[09:24:55.020395] Epoch: [1] [120/3112] lr: 0.000015 closs: 0.3937 (0.5968) grad_norm: 2.5363 (3.0561) time: 5.7298 data: 0.0002 max mem: 71323 +[09:25:52.337097] Epoch: [1] [130/3112] lr: 0.000015 closs: 0.3740 (0.5910) grad_norm: 2.5101 (3.0160) time: 5.7351 data: 0.0002 max mem: 71323 +[09:26:49.545038] Epoch: [1] [140/3112] lr: 0.000015 closs: 0.5434 (0.5974) grad_norm: 2.5101 (2.9740) time: 5.7261 data: 0.0002 max mem: 71323 +[09:27:46.816821] Epoch: [1] [150/3112] lr: 0.000015 closs: 0.5664 (0.5961) grad_norm: 2.5026 (2.9462) time: 5.7238 data: 0.0002 max mem: 71323 +[09:28:44.005956] Epoch: [1] [160/3112] lr: 0.000015 closs: 0.4724 (0.5938) grad_norm: 2.5101 (3.0338) time: 5.7229 data: 0.0002 max mem: 71323 +[09:29:41.124581] Epoch: [1] [170/3112] lr: 0.000015 closs: 0.5469 (0.5998) grad_norm: 2.5279 (2.9984) time: 5.7153 data: 0.0002 max mem: 71323 +[09:30:38.428538] Epoch: [1] [180/3112] lr: 0.000015 closs: 0.5174 (0.5855) grad_norm: 2.3213 (2.9285) time: 5.7210 data: 0.0002 max mem: 71323 +[09:31:35.610804] Epoch: [1] [190/3112] lr: 0.000015 closs: 0.4193 (0.5875) grad_norm: 2.5101 (2.9094) time: 5.7242 data: 0.0002 max mem: 71323 +[09:32:32.867799] Epoch: [1] [200/3112] lr: 0.000015 closs: 0.4952 (0.5797) grad_norm: 2.3213 (2.8696) time: 5.7218 data: 0.0002 max mem: 71323 +[09:33:30.162583] Epoch: [1] [210/3112] lr: 0.000015 closs: 0.5128 (0.5870) grad_norm: 2.3126 (2.8529) time: 5.7274 data: 0.0002 max mem: 71323 +[09:34:27.378116] Epoch: [1] [220/3112] lr: 0.000015 closs: 0.6949 (0.5925) grad_norm: 2.3126 (2.8630) time: 5.7253 data: 0.0002 max mem: 71323 +[09:35:24.555963] Epoch: [1] [230/3112] lr: 0.000015 closs: 0.6526 (0.5907) 
grad_norm: 2.1777 (2.8258) time: 5.7195 data: 0.0002 max mem: 71323 +[09:36:21.830213] Epoch: [1] [240/3112] lr: 0.000015 closs: 0.6494 (0.5972) grad_norm: 2.3126 (2.8309) time: 5.7225 data: 0.0002 max mem: 71323 +[09:37:18.971139] Epoch: [1] [250/3112] lr: 0.000015 closs: 0.6167 (0.5913) grad_norm: 2.3126 (2.9036) time: 5.7207 data: 0.0002 max mem: 71323 +[09:38:16.156640] Epoch: [1] [260/3112] lr: 0.000014 closs: 0.4620 (0.5917) grad_norm: 2.4741 (2.9573) time: 5.7162 data: 0.0002 max mem: 71323 +[09:39:13.349394] Epoch: [1] [270/3112] lr: 0.000014 closs: 0.5001 (0.5893) grad_norm: 2.4741 (3.0835) time: 5.7187 data: 0.0002 max mem: 71323 +[09:40:10.767725] Epoch: [1] [280/3112] lr: 0.000014 closs: 0.5001 (0.5882) grad_norm: 2.5559 (3.2597) time: 5.7304 data: 0.0002 max mem: 71323 +[09:41:08.217935] Epoch: [1] [290/3112] lr: 0.000014 closs: 0.5280 (0.5890) grad_norm: 2.7970 (3.2470) time: 5.7433 data: 0.0002 max mem: 71323 +[09:42:05.390828] Epoch: [1] [300/3112] lr: 0.000014 closs: 0.4845 (0.5870) grad_norm: 2.8056 (3.2733) time: 5.7311 data: 0.0002 max mem: 71323 +[09:43:02.735847] Epoch: [1] [310/3112] lr: 0.000014 closs: 0.4630 (0.5914) grad_norm: 3.1784 (3.2719) time: 5.7258 data: 0.0002 max mem: 71323 +[09:43:59.872682] Epoch: [1] [320/3112] lr: 0.000014 closs: 0.5880 (0.5949) grad_norm: 3.6826 (3.2877) time: 5.7239 data: 0.0002 max mem: 71323 +[09:44:57.096264] Epoch: [1] [330/3112] lr: 0.000014 closs: 0.6551 (0.5953) grad_norm: 3.6725 (3.2666) time: 5.7179 data: 0.0002 max mem: 71323 +[09:45:54.300111] Epoch: [1] [340/3112] lr: 0.000014 closs: 0.6551 (0.5981) grad_norm: 2.7940 (3.2398) time: 5.7212 data: 0.0002 max mem: 71323 +[09:46:51.587846] Epoch: [1] [350/3112] lr: 0.000014 closs: 0.5537 (0.5990) grad_norm: 2.7940 (3.2256) time: 5.7245 data: 0.0002 max mem: 71323 +[09:47:48.846949] Epoch: [1] [360/3112] lr: 0.000014 closs: 0.4309 (0.5962) grad_norm: 2.7940 (3.2131) time: 5.7272 data: 0.0002 max mem: 71323 +[09:48:46.044236] Epoch: [1] [370/3112] lr: 0.000014 closs: 0.4145 (0.5929) grad_norm: 2.6962 (3.1840) time: 5.7227 data: 0.0002 max mem: 71323 +[09:49:43.297324] Epoch: [1] [380/3112] lr: 0.000014 closs: 0.3955 (0.5863) grad_norm: 2.6962 (3.2518) time: 5.7224 data: 0.0002 max mem: 71323 +[09:50:40.368656] Epoch: [1] [390/3112] lr: 0.000014 closs: 0.3774 (0.5848) grad_norm: 2.7940 (3.3040) time: 5.7161 data: 0.0002 max mem: 71323 +[09:51:37.416368] Epoch: [1] [400/3112] lr: 0.000014 closs: 0.3774 (0.5828) grad_norm: 2.5880 (3.2691) time: 5.7058 data: 0.0002 max mem: 71323 +[09:52:34.687555] Epoch: [1] [410/3112] lr: 0.000014 closs: 0.4234 (0.5869) grad_norm: 2.4511 (3.2418) time: 5.7158 data: 0.0002 max mem: 71323 +[09:53:31.819333] Epoch: [1] [420/3112] lr: 0.000014 closs: 0.5497 (0.5877) grad_norm: 2.5880 (3.2550) time: 5.7200 data: 0.0002 max mem: 71323 +[09:54:28.979365] Epoch: [1] [430/3112] lr: 0.000014 closs: 0.6290 (0.5925) grad_norm: 2.4511 (3.2180) time: 5.7145 data: 0.0002 max mem: 71323 +[09:55:26.391158] Epoch: [1] [440/3112] lr: 0.000014 closs: 0.5628 (0.5879) grad_norm: 2.4008 (3.2018) time: 5.7284 data: 0.0002 max mem: 71323 +[09:56:23.706046] Epoch: [1] [450/3112] lr: 0.000014 closs: 0.3941 (0.5864) grad_norm: 2.4008 (3.2221) time: 5.7362 data: 0.0002 max mem: 71323 +[09:57:20.971356] Epoch: [1] [460/3112] lr: 0.000014 closs: 0.5131 (0.5866) grad_norm: 2.4008 (3.1983) time: 5.7289 data: 0.0002 max mem: 71323 +[09:58:18.150060] Epoch: [1] [470/3112] lr: 0.000014 closs: 0.5131 (0.5862) grad_norm: 2.4008 (3.2323) time: 5.7221 data: 0.0002 max mem: 71323 
+[09:59:15.375494] Epoch: [1] [480/3112] lr: 0.000014 closs: 0.4989 (0.5830) grad_norm: 2.7209 (3.2148) time: 5.7201 data: 0.0002 max mem: 71323 +[10:00:12.756426] Epoch: [1] [490/3112] lr: 0.000014 closs: 0.4090 (0.5823) grad_norm: 2.9341 (3.2141) time: 5.7302 data: 0.0002 max mem: 71323 +[10:01:10.155186] Epoch: [1] [500/3112] lr: 0.000014 closs: 0.4196 (0.5858) grad_norm: 2.9341 (3.2366) time: 5.7389 data: 0.0002 max mem: 71323 +[10:02:07.419313] Epoch: [1] [510/3112] lr: 0.000014 closs: 0.4680 (0.5842) grad_norm: 3.2766 (3.2527) time: 5.7330 data: 0.0002 max mem: 71323 +[10:03:04.705425] Epoch: [1] [520/3112] lr: 0.000014 closs: 0.5118 (0.5871) grad_norm: 2.9341 (3.2166) time: 5.7273 data: 0.0002 max mem: 71323 +[10:04:01.812210] Epoch: [1] [530/3112] lr: 0.000014 closs: 0.6283 (0.5862) grad_norm: 2.9341 (3.2362) time: 5.7195 data: 0.0002 max mem: 71323 +[10:04:58.925349] Epoch: [1] [540/3112] lr: 0.000014 closs: 0.6283 (0.5870) grad_norm: 3.2742 (3.2327) time: 5.7109 data: 0.0002 max mem: 71323 +[10:05:56.126245] Epoch: [1] [550/3112] lr: 0.000014 closs: 0.6905 (0.5943) grad_norm: 2.4639 (3.2662) time: 5.7156 data: 0.0002 max mem: 71323 +[10:06:53.279299] Epoch: [1] [560/3112] lr: 0.000014 closs: 0.5900 (0.5929) grad_norm: 3.4900 (3.3006) time: 5.7176 data: 0.0002 max mem: 71323 +[10:07:50.482632] Epoch: [1] [570/3112] lr: 0.000014 closs: 0.4469 (0.5922) grad_norm: 3.1543 (3.2974) time: 5.7177 data: 0.0002 max mem: 71323 +[10:08:47.826041] Epoch: [1] [580/3112] lr: 0.000013 closs: 0.5951 (0.5959) grad_norm: 2.6942 (3.2738) time: 5.7272 data: 0.0002 max mem: 71323 +[10:09:45.133692] Epoch: [1] [590/3112] lr: 0.000013 closs: 0.5321 (0.5913) grad_norm: 2.4965 (3.2625) time: 5.7325 data: 0.0002 max mem: 71323 +[10:10:42.484961] Epoch: [1] [600/3112] lr: 0.000013 closs: 0.3357 (0.5920) grad_norm: 2.9112 (3.2604) time: 5.7329 data: 0.0002 max mem: 71323 +[10:11:39.972485] Epoch: [1] [610/3112] lr: 0.000013 closs: 0.4469 (0.5962) grad_norm: 2.9112 (3.2624) time: 5.7419 data: 0.0002 max mem: 71323 +[10:12:36.983207] Epoch: [1] [620/3112] lr: 0.000013 closs: 0.2738 (0.5933) grad_norm: 2.9893 (3.2781) time: 5.7248 data: 0.0002 max mem: 71323 +[10:13:34.158905] Epoch: [1] [630/3112] lr: 0.000013 closs: 0.2682 (0.5921) grad_norm: 2.9893 (3.2718) time: 5.7091 data: 0.0002 max mem: 71323 +[10:14:31.501670] Epoch: [1] [640/3112] lr: 0.000013 closs: 0.5042 (0.5939) grad_norm: 2.9893 (3.3001) time: 5.7257 data: 0.0002 max mem: 71323 +[10:15:28.778460] Epoch: [1] [650/3112] lr: 0.000013 closs: 0.5042 (0.5950) grad_norm: 2.9112 (3.2839) time: 5.7308 data: 0.0002 max mem: 71323 +[10:16:26.016804] Epoch: [1] [660/3112] lr: 0.000013 closs: 0.4725 (0.5929) grad_norm: 2.9637 (3.3006) time: 5.7256 data: 0.0002 max mem: 71323 +[10:17:23.255978] Epoch: [1] [670/3112] lr: 0.000013 closs: 0.3954 (0.5896) grad_norm: 2.9112 (3.2894) time: 5.7237 data: 0.0002 max mem: 71323 +[10:18:20.447344] Epoch: [1] [680/3112] lr: 0.000013 closs: 0.4779 (0.5903) grad_norm: 2.8923 (3.2849) time: 5.7214 data: 0.0001 max mem: 71323 +[10:19:17.792473] Epoch: [1] [690/3112] lr: 0.000013 closs: 0.5278 (0.5901) grad_norm: 3.1712 (3.2944) time: 5.7267 data: 0.0002 max mem: 71323 +[10:20:15.021394] Epoch: [1] [700/3112] lr: 0.000013 closs: 0.5613 (0.5935) grad_norm: 2.8923 (3.3143) time: 5.7285 data: 0.0002 max mem: 71323 +[10:21:12.249507] Epoch: [1] [710/3112] lr: 0.000013 closs: 0.4689 (0.5929) grad_norm: 2.8923 (3.3155) time: 5.7227 data: 0.0002 max mem: 71323 +[10:22:09.416494] Epoch: [1] [720/3112] lr: 0.000013 closs: 0.3400 
(0.5918) grad_norm: 2.6869 (3.3072) time: 5.7196 data: 0.0002 max mem: 71323 +[10:23:06.716988] Epoch: [1] [730/3112] lr: 0.000013 closs: 0.4069 (0.5926) grad_norm: 2.6869 (3.3344) time: 5.7232 data: 0.0001 max mem: 71323 +[10:24:03.950324] Epoch: [1] [740/3112] lr: 0.000013 closs: 0.4222 (0.5914) grad_norm: 2.8923 (3.3575) time: 5.7266 data: 0.0002 max mem: 71323 +[10:25:01.053011] Epoch: [1] [750/3112] lr: 0.000013 closs: 0.4662 (0.5911) grad_norm: 2.8923 (3.3481) time: 5.7167 data: 0.0002 max mem: 71323 +[10:25:58.414720] Epoch: [1] [760/3112] lr: 0.000013 closs: 0.4662 (0.5902) grad_norm: 2.6641 (3.3131) time: 5.7230 data: 0.0002 max mem: 71323 +[10:26:55.660258] Epoch: [1] [770/3112] lr: 0.000013 closs: 0.5543 (0.5914) grad_norm: 2.6641 (3.3250) time: 5.7302 data: 0.0002 max mem: 71323 +[10:27:52.939005] Epoch: [1] [780/3112] lr: 0.000013 closs: 0.5727 (0.5924) grad_norm: 2.6641 (3.3267) time: 5.7261 data: 0.0002 max mem: 71323 +[10:28:50.109997] Epoch: [1] [790/3112] lr: 0.000013 closs: 0.5498 (0.5930) grad_norm: 2.6724 (3.3225) time: 5.7223 data: 0.0001 max mem: 71323 +[10:29:47.464775] Epoch: [1] [800/3112] lr: 0.000013 closs: 0.4946 (0.5913) grad_norm: 2.6641 (3.3178) time: 5.7261 data: 0.0002 max mem: 71323 +[10:30:44.724220] Epoch: [1] [810/3112] lr: 0.000013 closs: 0.4071 (0.5892) grad_norm: 2.6641 (3.3127) time: 5.7305 data: 0.0002 max mem: 71323 +[10:31:42.017823] Epoch: [1] [820/3112] lr: 0.000013 closs: 0.4040 (0.5893) grad_norm: 2.6641 (3.3025) time: 5.7275 data: 0.0002 max mem: 71323 +[10:32:39.128564] Epoch: [1] [830/3112] lr: 0.000013 closs: 0.5282 (0.5887) grad_norm: 2.4810 (3.2895) time: 5.7201 data: 0.0002 max mem: 71323 +[10:33:36.369442] Epoch: [1] [840/3112] lr: 0.000013 closs: 0.5446 (0.5907) grad_norm: 2.6724 (3.2713) time: 5.7174 data: 0.0002 max mem: 71323 +[10:34:33.662500] Epoch: [1] [850/3112] lr: 0.000013 closs: 0.5569 (0.5916) grad_norm: 2.6724 (3.3730) time: 5.7266 data: 0.0002 max mem: 71323 +[10:35:30.862765] Epoch: [1] [860/3112] lr: 0.000013 closs: 0.6764 (0.5922) grad_norm: 2.6498 (3.3585) time: 5.7246 data: 0.0002 max mem: 71323 +[10:36:28.083971] Epoch: [1] [870/3112] lr: 0.000013 closs: 0.5108 (0.5910) grad_norm: 2.4810 (3.3472) time: 5.7209 data: 0.0002 max mem: 71323 +[10:37:25.248279] Epoch: [1] [880/3112] lr: 0.000012 closs: 0.4094 (0.5903) grad_norm: 2.6498 (3.3428) time: 5.7191 data: 0.0002 max mem: 71323 +[10:38:22.377388] Epoch: [1] [890/3112] lr: 0.000012 closs: 0.4666 (0.5906) grad_norm: 2.6498 (3.3474) time: 5.7146 data: 0.0002 max mem: 71323 +[10:39:19.626307] Epoch: [1] [900/3112] lr: 0.000012 closs: 0.4949 (0.5897) grad_norm: 2.6498 (3.3511) time: 5.7188 data: 0.0002 max mem: 71323 +[10:40:16.906025] Epoch: [1] [910/3112] lr: 0.000012 closs: 0.5034 (0.5907) grad_norm: 2.6714 (3.3417) time: 5.7263 data: 0.0002 max mem: 71323 +[10:41:14.233100] Epoch: [1] [920/3112] lr: 0.000012 closs: 0.4565 (0.5896) grad_norm: 2.6714 (3.3346) time: 5.7302 data: 0.0002 max mem: 71323 +[10:42:11.577461] Epoch: [1] [930/3112] lr: 0.000012 closs: 0.4489 (0.5907) grad_norm: 2.6498 (3.3365) time: 5.7334 data: 0.0002 max mem: 71323 +[10:43:09.013813] Epoch: [1] [940/3112] lr: 0.000012 closs: 0.5679 (0.5910) grad_norm: 2.9173 (3.3307) time: 5.7389 data: 0.0002 max mem: 71323 +[10:44:06.248088] Epoch: [1] [950/3112] lr: 0.000012 closs: 0.6039 (0.5917) grad_norm: 2.9173 (3.3249) time: 5.7334 data: 0.0002 max mem: 71323 +[10:45:03.409964] Epoch: [1] [960/3112] lr: 0.000012 closs: 0.4689 (0.5898) grad_norm: 2.8824 (3.3289) time: 5.7197 data: 0.0002 max mem: 
71323 +[10:46:00.554259] Epoch: [1] [970/3112] lr: 0.000012 closs: 0.4153 (0.5887) grad_norm: 2.9173 (3.3301) time: 5.7152 data: 0.0002 max mem: 71323 +[10:46:57.869308] Epoch: [1] [980/3112] lr: 0.000012 closs: 0.4596 (0.5885) grad_norm: 2.7959 (3.3133) time: 5.7229 data: 0.0002 max mem: 71323 +[10:47:55.143955] Epoch: [1] [990/3112] lr: 0.000012 closs: 0.4499 (0.5861) grad_norm: 2.8824 (3.3161) time: 5.7294 data: 0.0002 max mem: 71323 +[10:48:52.318846] Epoch: [1] [1000/3112] lr: 0.000012 closs: 0.3549 (0.5854) grad_norm: 2.7959 (3.2914) time: 5.7223 data: 0.0002 max mem: 71323 +[10:49:49.606271] Epoch: [1] [1010/3112] lr: 0.000012 closs: 0.3908 (0.5848) grad_norm: 2.7959 (3.2882) time: 5.7229 data: 0.0002 max mem: 71323 +[10:50:46.806763] Epoch: [1] [1020/3112] lr: 0.000012 closs: 0.4478 (0.5856) grad_norm: 2.5258 (3.2734) time: 5.7242 data: 0.0002 max mem: 71323 +[10:51:43.957700] Epoch: [1] [1030/3112] lr: 0.000012 closs: 0.5542 (0.5870) grad_norm: 2.7959 (3.2757) time: 5.7174 data: 0.0002 max mem: 71323 +[10:52:41.253509] Epoch: [1] [1040/3112] lr: 0.000012 closs: 0.6394 (0.5893) grad_norm: 2.5258 (3.2677) time: 5.7222 data: 0.0002 max mem: 71323 +[10:53:38.462515] Epoch: [1] [1050/3112] lr: 0.000012 closs: 0.7351 (0.5919) grad_norm: 2.5258 (3.2936) time: 5.7251 data: 0.0002 max mem: 71323 +[10:54:35.626181] Epoch: [1] [1060/3112] lr: 0.000012 closs: 0.7025 (0.5912) grad_norm: 2.5258 (3.2843) time: 5.7185 data: 0.0002 max mem: 71323 +[10:55:32.786241] Epoch: [1] [1070/3112] lr: 0.000012 closs: 0.5212 (0.5917) grad_norm: 2.4397 (3.2826) time: 5.7160 data: 0.0002 max mem: 71323 +[10:56:30.250345] Epoch: [1] [1080/3112] lr: 0.000012 closs: 0.5292 (0.5924) grad_norm: 2.5258 (3.2694) time: 5.7310 data: 0.0002 max mem: 71323 +[10:57:27.632078] Epoch: [1] [1090/3112] lr: 0.000012 closs: 0.4530 (0.5908) grad_norm: 2.4397 (3.2631) time: 5.7422 data: 0.0002 max mem: 71323 +[10:58:24.783088] Epoch: [1] [1100/3112] lr: 0.000012 closs: 0.4454 (0.5931) grad_norm: 2.6931 (3.2596) time: 5.7265 data: 0.0002 max mem: 71323 +[10:59:21.991752] Epoch: [1] [1110/3112] lr: 0.000012 closs: 0.4928 (0.5931) grad_norm: 2.3288 (3.2479) time: 5.7178 data: 0.0002 max mem: 71323 +[11:00:19.108220] Epoch: [1] [1120/3112] lr: 0.000012 closs: 0.5569 (0.5935) grad_norm: 2.4397 (3.2474) time: 5.7161 data: 0.0002 max mem: 71323 +[11:01:16.402525] Epoch: [1] [1130/3112] lr: 0.000012 closs: 0.5298 (0.5922) grad_norm: 2.2847 (3.2378) time: 5.7204 data: 0.0002 max mem: 71323 +[11:02:13.676738] Epoch: [1] [1140/3112] lr: 0.000012 closs: 0.3138 (0.5895) grad_norm: 2.1500 (3.2298) time: 5.7283 data: 0.0002 max mem: 71323 +[11:03:10.912988] Epoch: [1] [1150/3112] lr: 0.000012 closs: 0.4097 (0.5897) grad_norm: 2.4692 (3.2332) time: 5.7253 data: 0.0002 max mem: 71323 +[11:04:08.082352] Epoch: [1] [1160/3112] lr: 0.000012 closs: 0.5033 (0.5885) grad_norm: 2.5096 (3.2274) time: 5.7201 data: 0.0002 max mem: 71323 +[11:05:05.376220] Epoch: [1] [1170/3112] lr: 0.000012 closs: 0.3751 (0.5876) grad_norm: 2.5096 (3.2198) time: 5.7230 data: 0.0002 max mem: 71323 +[11:06:02.677738] Epoch: [1] [1180/3112] lr: 0.000011 closs: 0.3893 (0.5869) grad_norm: 2.4692 (3.2068) time: 5.7296 data: 0.0002 max mem: 71323 +[11:06:59.907334] Epoch: [1] [1190/3112] lr: 0.000011 closs: 0.4778 (0.5870) grad_norm: 2.5218 (3.2075) time: 5.7264 data: 0.0002 max mem: 71323 +[11:07:57.318091] Epoch: [1] [1200/3112] lr: 0.000011 closs: 0.5204 (0.5879) grad_norm: 2.6057 (3.2079) time: 5.7318 data: 0.0002 max mem: 71323 +[11:08:54.620193] Epoch: [1] [1210/3112] lr: 
0.000011 closs: 0.4760 (0.5876) grad_norm: 2.6057 (3.2104) time: 5.7355 data: 0.0002 max mem: 71323 +[11:09:51.846111] Epoch: [1] [1220/3112] lr: 0.000011 closs: 0.4245 (0.5868) grad_norm: 2.5096 (3.1935) time: 5.7262 data: 0.0002 max mem: 71323 +[11:10:49.054796] Epoch: [1] [1230/3112] lr: 0.000011 closs: 0.5212 (0.5883) grad_norm: 2.6057 (3.1975) time: 5.7216 data: 0.0002 max mem: 71323 +[11:11:46.526720] Epoch: [1] [1240/3112] lr: 0.000011 closs: 0.6609 (0.5894) grad_norm: 2.6077 (3.1989) time: 5.7339 data: 0.0002 max mem: 71323 +[11:12:43.813824] Epoch: [1] [1250/3112] lr: 0.000011 closs: 0.6556 (0.5900) grad_norm: 2.6291 (3.2022) time: 5.7378 data: 0.0002 max mem: 71323 +[11:13:41.088300] Epoch: [1] [1260/3112] lr: 0.000011 closs: 0.6556 (0.5917) grad_norm: 3.0648 (3.2045) time: 5.7280 data: 0.0002 max mem: 71323 +[11:14:38.493865] Epoch: [1] [1270/3112] lr: 0.000011 closs: 0.4273 (0.5907) grad_norm: 2.6077 (3.1941) time: 5.7339 data: 0.0002 max mem: 71323 +[11:15:35.673348] Epoch: [1] [1280/3112] lr: 0.000011 closs: 0.4905 (0.5913) grad_norm: 2.6077 (3.2059) time: 5.7291 data: 0.0002 max mem: 71323 +[11:16:32.990633] Epoch: [1] [1290/3112] lr: 0.000011 closs: 0.5486 (0.5933) grad_norm: 2.6291 (3.2151) time: 5.7247 data: 0.0002 max mem: 71323 +[11:17:30.301797] Epoch: [1] [1300/3112] lr: 0.000011 closs: 0.7227 (0.5933) grad_norm: 3.0276 (3.2079) time: 5.7313 data: 0.0002 max mem: 71323 +[11:18:27.484038] Epoch: [1] [1310/3112] lr: 0.000011 closs: 0.4671 (0.5927) grad_norm: 2.6291 (3.2069) time: 5.7245 data: 0.0002 max mem: 71323 +[11:19:24.626984] Epoch: [1] [1320/3112] lr: 0.000011 closs: 0.4671 (0.5924) grad_norm: 2.6291 (3.1977) time: 5.7162 data: 0.0002 max mem: 71323 +[11:20:21.813622] Epoch: [1] [1330/3112] lr: 0.000011 closs: 0.4525 (0.5918) grad_norm: 2.3149 (3.1933) time: 5.7164 data: 0.0002 max mem: 71323 +[11:21:19.048041] Epoch: [1] [1340/3112] lr: 0.000011 closs: 0.4914 (0.5918) grad_norm: 2.3149 (3.1969) time: 5.7209 data: 0.0001 max mem: 71323 +[11:22:16.151934] Epoch: [1] [1350/3112] lr: 0.000011 closs: 0.5367 (0.5911) grad_norm: 3.0276 (3.2046) time: 5.7168 data: 0.0001 max mem: 71323 +[11:23:13.341788] Epoch: [1] [1360/3112] lr: 0.000011 closs: 0.5309 (0.5914) grad_norm: 3.2565 (3.2137) time: 5.7145 data: 0.0002 max mem: 71323 +[11:24:10.444125] Epoch: [1] [1370/3112] lr: 0.000011 closs: 0.5309 (0.5922) grad_norm: 2.8857 (3.2104) time: 5.7144 data: 0.0002 max mem: 71323 +[11:25:07.631412] Epoch: [1] [1380/3112] lr: 0.000011 closs: 0.4694 (0.5913) grad_norm: 2.6724 (3.2021) time: 5.7143 data: 0.0002 max mem: 71323 +[11:26:04.875708] Epoch: [1] [1390/3112] lr: 0.000011 closs: 0.4255 (0.5906) grad_norm: 2.7996 (3.2022) time: 5.7215 data: 0.0002 max mem: 71323 +[11:27:02.203371] Epoch: [1] [1400/3112] lr: 0.000011 closs: 0.3712 (0.5897) grad_norm: 2.8857 (3.1998) time: 5.7285 data: 0.0002 max mem: 71323 +[11:27:59.445357] Epoch: [1] [1410/3112] lr: 0.000011 closs: 0.4590 (0.5905) grad_norm: 3.1312 (3.2088) time: 5.7283 data: 0.0002 max mem: 71323 +[11:28:56.662186] Epoch: [1] [1420/3112] lr: 0.000011 closs: 0.5930 (0.5920) grad_norm: 2.8857 (3.2133) time: 5.7228 data: 0.0002 max mem: 71323 +[11:29:53.847010] Epoch: [1] [1430/3112] lr: 0.000011 closs: 0.7271 (0.5933) grad_norm: 2.8857 (3.2176) time: 5.7200 data: 0.0002 max mem: 71323 +[11:30:50.945922] Epoch: [1] [1440/3112] lr: 0.000011 closs: 0.6011 (0.5923) grad_norm: 2.7897 (3.2292) time: 5.7141 data: 0.0002 max mem: 71323 +[11:31:48.177318] Epoch: [1] [1450/3112] lr: 0.000011 closs: 0.6011 (0.5927) grad_norm: 2.7996 
(3.2342) time: 5.7164 data: 0.0002 max mem: 71323 +[11:32:45.573178] Epoch: [1] [1460/3112] lr: 0.000011 closs: 0.6192 (0.5933) grad_norm: 3.1312 (3.2410) time: 5.7312 data: 0.0002 max mem: 71323 +[11:33:42.732144] Epoch: [1] [1470/3112] lr: 0.000011 closs: 0.4277 (0.5918) grad_norm: 2.9557 (3.2303) time: 5.7276 data: 0.0002 max mem: 71323 +[11:34:39.836379] Epoch: [1] [1480/3112] lr: 0.000010 closs: 0.4764 (0.5918) grad_norm: 2.7897 (3.2387) time: 5.7130 data: 0.0002 max mem: 71323 +[11:35:37.066448] Epoch: [1] [1490/3112] lr: 0.000010 closs: 0.4876 (0.5913) grad_norm: 2.7897 (3.2528) time: 5.7166 data: 0.0002 max mem: 71323 +[11:36:34.301843] Epoch: [1] [1500/3112] lr: 0.000010 closs: 0.5181 (0.5913) grad_norm: 2.5262 (3.2468) time: 5.7232 data: 0.0002 max mem: 71323 +[11:37:31.497365] Epoch: [1] [1510/3112] lr: 0.000010 closs: 0.4729 (0.5905) grad_norm: 2.5262 (3.2451) time: 5.7214 data: 0.0002 max mem: 71323 +[11:38:28.641847] Epoch: [1] [1520/3112] lr: 0.000010 closs: 0.4301 (0.5902) grad_norm: 2.9451 (3.2445) time: 5.7169 data: 0.0002 max mem: 71323 +[11:39:25.917787] Epoch: [1] [1530/3112] lr: 0.000010 closs: 0.5605 (0.5910) grad_norm: 2.9451 (3.2567) time: 5.7209 data: 0.0002 max mem: 71323 +[11:40:23.310267] Epoch: [1] [1540/3112] lr: 0.000010 closs: 0.4657 (0.5897) grad_norm: 2.9451 (3.2591) time: 5.7333 data: 0.0002 max mem: 71323 +[11:41:20.558510] Epoch: [1] [1550/3112] lr: 0.000010 closs: 0.4749 (0.5898) grad_norm: 3.6296 (3.2614) time: 5.7319 data: 0.0002 max mem: 71323 +[11:42:17.900875] Epoch: [1] [1560/3112] lr: 0.000010 closs: 0.5047 (0.5901) grad_norm: 3.6296 (3.2752) time: 5.7294 data: 0.0002 max mem: 71323 +[11:43:15.253469] Epoch: [1] [1570/3112] lr: 0.000010 closs: 0.4256 (0.5894) grad_norm: 3.2455 (3.2834) time: 5.7346 data: 0.0002 max mem: 71323 +[11:44:12.357870] Epoch: [1] [1580/3112] lr: 0.000010 closs: 0.4374 (0.5902) grad_norm: 3.2455 (3.2768) time: 5.7227 data: 0.0002 max mem: 71323 +[11:45:09.566234] Epoch: [1] [1590/3112] lr: 0.000010 closs: 0.4374 (0.5893) grad_norm: 3.2455 (3.2798) time: 5.7155 data: 0.0002 max mem: 71323 +[11:46:06.816539] Epoch: [1] [1600/3112] lr: 0.000010 closs: 0.4654 (0.5886) grad_norm: 3.2971 (3.2803) time: 5.7228 data: 0.0002 max mem: 71323 +[11:47:04.157505] Epoch: [1] [1610/3112] lr: 0.000010 closs: 0.4957 (0.5890) grad_norm: 3.1038 (3.2721) time: 5.7294 data: 0.0002 max mem: 71323 +[11:48:01.504317] Epoch: [1] [1620/3112] lr: 0.000010 closs: 0.5620 (0.5890) grad_norm: 3.0546 (3.2695) time: 5.7342 data: 0.0002 max mem: 71323 +[11:48:58.642889] Epoch: [1] [1630/3112] lr: 0.000010 closs: 0.6100 (0.5887) grad_norm: 2.0007 (3.2608) time: 5.7241 data: 0.0002 max mem: 71323 +[11:49:55.862927] Epoch: [1] [1640/3112] lr: 0.000010 closs: 0.4451 (0.5881) grad_norm: 2.2261 (3.2683) time: 5.7178 data: 0.0002 max mem: 71323 +[11:50:53.064803] Epoch: [1] [1650/3112] lr: 0.000010 closs: 0.4269 (0.5893) grad_norm: 2.0007 (3.2623) time: 5.7209 data: 0.0002 max mem: 71323 +[11:51:50.267282] Epoch: [1] [1660/3112] lr: 0.000010 closs: 0.4269 (0.5885) grad_norm: 2.2261 (3.2666) time: 5.7200 data: 0.0002 max mem: 71323 +[11:52:47.415142] Epoch: [1] [1670/3112] lr: 0.000010 closs: 0.5824 (0.5894) grad_norm: 2.7627 (3.2681) time: 5.7174 data: 0.0002 max mem: 71323 +[11:53:44.734202] Epoch: [1] [1680/3112] lr: 0.000010 closs: 0.5969 (0.5892) grad_norm: 2.7627 (3.2874) time: 5.7232 data: 0.0002 max mem: 71323 +[11:54:41.952519] Epoch: [1] [1690/3112] lr: 0.000010 closs: 0.3518 (0.5886) grad_norm: 2.7627 (3.2793) time: 5.7267 data: 0.0002 max mem: 71323 
+[11:55:39.151299] Epoch: [1] [1700/3112] lr: 0.000010 closs: 0.3408 (0.5879) grad_norm: 2.7627 (3.2702) time: 5.7207 data: 0.0002 max mem: 71323 +[11:56:36.474373] Epoch: [1] [1710/3112] lr: 0.000010 closs: 0.4531 (0.5872) grad_norm: 2.8075 (3.2677) time: 5.7259 data: 0.0002 max mem: 71323 +[11:57:33.724227] Epoch: [1] [1720/3112] lr: 0.000010 closs: 0.5744 (0.5883) grad_norm: 2.8075 (3.2729) time: 5.7285 data: 0.0002 max mem: 71323 +[11:58:30.968599] Epoch: [1] [1730/3112] lr: 0.000010 closs: 0.6342 (0.5876) grad_norm: 3.1751 (3.2690) time: 5.7246 data: 0.0002 max mem: 71323 +[11:59:28.232892] Epoch: [1] [1740/3112] lr: 0.000010 closs: 0.4218 (0.5876) grad_norm: 2.8075 (3.2663) time: 5.7253 data: 0.0002 max mem: 71323 +[12:00:25.410567] Epoch: [1] [1750/3112] lr: 0.000010 closs: 0.4215 (0.5868) grad_norm: 2.5246 (3.2613) time: 5.7219 data: 0.0002 max mem: 71323 +[12:01:22.605271] Epoch: [1] [1760/3112] lr: 0.000010 closs: 0.4215 (0.5888) grad_norm: 2.5246 (3.2572) time: 5.7185 data: 0.0002 max mem: 71323 +[12:02:19.841294] Epoch: [1] [1770/3112] lr: 0.000009 closs: 0.8454 (0.5900) grad_norm: 2.6557 (3.2557) time: 5.7214 data: 0.0002 max mem: 71323 +[12:03:17.342061] Epoch: [1] [1780/3112] lr: 0.000009 closs: 0.5844 (0.5902) grad_norm: 2.7213 (3.2587) time: 5.7367 data: 0.0002 max mem: 71323 +[12:04:14.528291] Epoch: [1] [1790/3112] lr: 0.000009 closs: 0.5288 (0.5905) grad_norm: 3.1751 (3.2697) time: 5.7342 data: 0.0002 max mem: 71323 +[12:05:11.735538] Epoch: [1] [1800/3112] lr: 0.000009 closs: 0.6921 (0.5929) grad_norm: 3.0689 (3.2683) time: 5.7195 data: 0.0002 max mem: 71323 +[12:06:09.040325] Epoch: [1] [1810/3112] lr: 0.000009 closs: 0.5951 (0.5927) grad_norm: 2.9554 (3.2591) time: 5.7255 data: 0.0002 max mem: 71323 +[12:07:06.189129] Epoch: [1] [1820/3112] lr: 0.000009 closs: 0.4475 (0.5912) grad_norm: 2.7213 (3.2606) time: 5.7226 data: 0.0002 max mem: 71323 +[12:08:03.467195] Epoch: [1] [1830/3112] lr: 0.000009 closs: 0.3754 (0.5910) grad_norm: 2.6557 (3.2538) time: 5.7212 data: 0.0002 max mem: 71323 +[12:09:00.673280] Epoch: [1] [1840/3112] lr: 0.000009 closs: 0.4733 (0.5922) grad_norm: 2.9554 (3.3133) time: 5.7241 data: 0.0002 max mem: 71323 +[12:09:57.887053] Epoch: [1] [1850/3112] lr: 0.000009 closs: 0.7336 (0.5926) grad_norm: 2.9554 (3.3109) time: 5.7208 data: 0.0002 max mem: 71323 +[12:10:55.055840] Epoch: [1] [1860/3112] lr: 0.000009 closs: 0.5516 (0.5932) grad_norm: 2.9554 (3.3582) time: 5.7189 data: 0.0002 max mem: 71323 +[12:11:52.275261] Epoch: [1] [1870/3112] lr: 0.000009 closs: 0.5572 (0.5937) grad_norm: 2.7037 (3.3586) time: 5.7192 data: 0.0002 max mem: 71323 +[12:12:49.681255] Epoch: [1] [1880/3112] lr: 0.000009 closs: 0.5448 (0.5940) grad_norm: 2.5961 (3.3559) time: 5.7311 data: 0.0002 max mem: 71323 +[12:13:46.957072] Epoch: [1] [1890/3112] lr: 0.000009 closs: 0.7026 (0.5956) grad_norm: 2.7037 (3.3679) time: 5.7340 data: 0.0002 max mem: 71323 +[12:14:44.093408] Epoch: [1] [1900/3112] lr: 0.000009 closs: 0.7545 (0.5953) grad_norm: 2.7037 (3.3575) time: 5.7205 data: 0.0002 max mem: 71323 +[12:15:41.300479] Epoch: [1] [1910/3112] lr: 0.000009 closs: 0.4249 (0.5951) grad_norm: 2.7037 (3.3512) time: 5.7170 data: 0.0002 max mem: 71323 +[12:16:38.566453] Epoch: [1] [1920/3112] lr: 0.000009 closs: 0.4820 (0.5949) grad_norm: 2.7037 (3.3449) time: 5.7235 data: 0.0002 max mem: 71323 +[12:17:35.797466] Epoch: [1] [1930/3112] lr: 0.000009 closs: 0.6097 (0.5963) grad_norm: 2.7463 (3.3464) time: 5.7247 data: 0.0002 max mem: 71323 +[12:18:33.101881] Epoch: [1] [1940/3112] lr: 
0.000009 closs: 0.6386 (0.5970) grad_norm: 2.7037 (3.3383) time: 5.7266 data: 0.0002 max mem: 71323 +[12:19:30.262181] Epoch: [1] [1950/3112] lr: 0.000009 closs: 0.6184 (0.5979) grad_norm: 1.9001 (3.3369) time: 5.7231 data: 0.0002 max mem: 71323 +[12:20:27.467755] Epoch: [1] [1960/3112] lr: 0.000009 closs: 0.6184 (0.5978) grad_norm: 2.4307 (3.3439) time: 5.7181 data: 0.0002 max mem: 71323 +[12:21:24.729924] Epoch: [1] [1970/3112] lr: 0.000009 closs: 0.6336 (0.5989) grad_norm: 2.4307 (3.3395) time: 5.7232 data: 0.0002 max mem: 71323 +[12:22:21.852108] Epoch: [1] [1980/3112] lr: 0.000009 closs: 0.6551 (0.5989) grad_norm: 2.7463 (3.3457) time: 5.7190 data: 0.0002 max mem: 71323 +[12:23:18.987374] Epoch: [1] [1990/3112] lr: 0.000009 closs: 0.3943 (0.5976) grad_norm: 2.7463 (3.3393) time: 5.7127 data: 0.0002 max mem: 71323 +[12:24:16.160866] Epoch: [1] [2000/3112] lr: 0.000009 closs: 0.3285 (0.5970) grad_norm: 2.6790 (3.3361) time: 5.7153 data: 0.0002 max mem: 71323 +[12:25:13.471043] Epoch: [1] [2010/3112] lr: 0.000009 closs: 0.5321 (0.5976) grad_norm: 2.6093 (3.3394) time: 5.7240 data: 0.0002 max mem: 71323 +[12:26:10.749234] Epoch: [1] [2020/3112] lr: 0.000009 closs: 0.5589 (0.5970) grad_norm: 2.4307 (3.3284) time: 5.7292 data: 0.0002 max mem: 71323 +[12:27:07.904340] Epoch: [1] [2030/3112] lr: 0.000009 closs: 0.4451 (0.5969) grad_norm: 2.4307 (3.3485) time: 5.7215 data: 0.0002 max mem: 71323 +[12:28:05.456536] Epoch: [1] [2040/3112] lr: 0.000009 closs: 0.4698 (0.5960) grad_norm: 2.6093 (3.3932) time: 5.7352 data: 0.0002 max mem: 71323 +[12:29:02.656736] Epoch: [1] [2050/3112] lr: 0.000009 closs: 0.5650 (0.5979) grad_norm: 2.8340 (3.3939) time: 5.7375 data: 0.0002 max mem: 71323 +[12:29:59.957091] Epoch: [1] [2060/3112] lr: 0.000008 closs: 0.6246 (0.5973) grad_norm: 3.1980 (3.4077) time: 5.7249 data: 0.0002 max mem: 71323 +[12:30:57.177863] Epoch: [1] [2070/3112] lr: 0.000008 closs: 0.4285 (0.5969) grad_norm: 3.4748 (3.4070) time: 5.7259 data: 0.0002 max mem: 71323 +[12:31:54.356024] Epoch: [1] [2080/3112] lr: 0.000008 closs: 0.4701 (0.5965) grad_norm: 3.1980 (3.4018) time: 5.7198 data: 0.0002 max mem: 71323 +[12:32:51.470639] Epoch: [1] [2090/3112] lr: 0.000008 closs: 0.4846 (0.5965) grad_norm: 3.4776 (3.4022) time: 5.7144 data: 0.0002 max mem: 71323 +[12:33:48.871540] Epoch: [1] [2100/3112] lr: 0.000008 closs: 0.5748 (0.5964) grad_norm: 3.4776 (3.3957) time: 5.7256 data: 0.0002 max mem: 71323 +[12:34:46.059308] Epoch: [1] [2110/3112] lr: 0.000008 closs: 0.5776 (0.5960) grad_norm: 3.5306 (3.4058) time: 5.7293 data: 0.0002 max mem: 71323 +[12:35:43.351339] Epoch: [1] [2120/3112] lr: 0.000008 closs: 0.5384 (0.5953) grad_norm: 3.1438 (3.3959) time: 5.7239 data: 0.0002 max mem: 71323 +[12:36:40.656606] Epoch: [1] [2130/3112] lr: 0.000008 closs: 0.5224 (0.5955) grad_norm: 3.1402 (3.3958) time: 5.7298 data: 0.0002 max mem: 71323 +[12:37:37.892044] Epoch: [1] [2140/3112] lr: 0.000008 closs: 0.6531 (0.5963) grad_norm: 2.7086 (3.3917) time: 5.7269 data: 0.0002 max mem: 71323 +[12:38:35.041428] Epoch: [1] [2150/3112] lr: 0.000008 closs: 0.5073 (0.5958) grad_norm: 2.7086 (3.4039) time: 5.7191 data: 0.0002 max mem: 71323 +[12:39:32.406568] Epoch: [1] [2160/3112] lr: 0.000008 closs: 0.5021 (0.5956) grad_norm: 2.6556 (3.3951) time: 5.7256 data: 0.0001 max mem: 71323 +[12:40:29.561408] Epoch: [1] [2170/3112] lr: 0.000008 closs: 0.5687 (0.5955) grad_norm: 2.4673 (3.3940) time: 5.7258 data: 0.0002 max mem: 71323 +[12:41:26.793861] Epoch: [1] [2180/3112] lr: 0.000008 closs: 0.5578 (0.5958) grad_norm: 2.4673 
(3.3883) time: 5.7192 data: 0.0002 max mem: 71323
+[12:42:24.069189] Epoch: [1] [2190/3112] lr: 0.000008 closs: 0.4161 (0.5961) grad_norm: 2.4673 (3.3897) time: 5.7252 data: 0.0002 max mem: 71323
+[12:43:21.355468] Epoch: [1] [2200/3112] lr: 0.000008 closs: 0.3466 (0.5955) grad_norm: 2.6556 (3.3892) time: 5.7279 data: 0.0002 max mem: 71323
+[12:44:18.722056] Epoch: [1] [2210/3112] lr: 0.000008 closs: 0.5317 (0.5961) grad_norm: 2.8544 (3.3896) time: 5.7325 data: 0.0002 max mem: 71323
+[12:45:15.947045] Epoch: [1] [2220/3112] lr: 0.000008 closs: 0.5568 (0.5953) grad_norm: 2.8363 (3.3861) time: 5.7294 data: 0.0002 max mem: 71323
+[12:46:13.162124] Epoch: [1] [2230/3112] lr: 0.000008 closs: 0.4908 (0.5956) grad_norm: 2.7306 (3.3831) time: 5.7219 data: 0.0002 max mem: 71323
+[12:47:10.433450] Epoch: [1] [2240/3112] lr: 0.000008 closs: 0.4908 (0.5952) grad_norm: 2.7306 (3.3800) time: 5.7242 data: 0.0002 max mem: 71323
+[12:48:07.647316] Epoch: [1] [2250/3112] lr: 0.000008 closs: 0.6278 (0.5955) grad_norm: 2.7306 (3.3859) time: 5.7241 data: 0.0002 max mem: 71323
+[12:49:04.948119] Epoch: [1] [2260/3112] lr: 0.000008 closs: 0.5510 (0.5947) grad_norm: 2.7950 (3.3876) time: 5.7256 data: 0.0002 max mem: 71323
+[12:50:02.219073] Epoch: [1] [2270/3112] lr: 0.000008 closs: 0.4686 (0.5949) grad_norm: 2.7306 (3.3881) time: 5.7284 data: 0.0002 max mem: 71323
+[12:50:59.617042] Epoch: [1] [2280/3112] lr: 0.000008 closs: 0.6189 (0.5948) grad_norm: 2.7306 (3.3836) time: 5.7333 data: 0.0002 max mem: 71323
+[12:51:56.791704] Epoch: [1] [2290/3112] lr: 0.000008 closs: 0.3398 (0.5937) grad_norm: 2.5547 (3.3775) time: 5.7285 data: 0.0002 max mem: 71323
+[12:52:54.067303] Epoch: [1] [2300/3112] lr: 0.000008 closs: 0.3655 (0.5935) grad_norm: 2.5076 (3.3752) time: 5.7224 data: 0.0002 max mem: 71323
+[12:53:51.201662] Epoch: [1] [2310/3112] lr: 0.000008 closs: 0.5810 (0.5939) grad_norm: 2.5789 (3.3742) time: 5.7204 data: 0.0001 max mem: 71323
+[12:54:48.485091] Epoch: [1] [2320/3112] lr: 0.000008 closs: 0.5379 (0.5948) grad_norm: 2.5316 (3.3818) time: 5.7208 data: 0.0001 max mem: 71323
+[12:55:45.639381] Epoch: [1] [2330/3112] lr: 0.000008 closs: 0.4977 (0.5955) grad_norm: 2.5316 (3.3787) time: 5.7217 data: 0.0002 max mem: 71323
+[12:56:42.934688] Epoch: [1] [2340/3112] lr: 0.000008 closs: 0.6554 (0.5955) grad_norm: 2.5316 (3.3800) time: 5.7223 data: 0.0002 max mem: 71323
+[12:57:40.215574] Epoch: [1] [2350/3112] lr: 0.000008 closs: 0.5161 (0.5954) grad_norm: 2.5316 (3.3788) time: 5.7287 data: 0.0002 max mem: 71323
+[12:58:37.630204] Epoch: [1] [2360/3112] lr: 0.000007 closs: 0.4461 (0.5949) grad_norm: 2.3602 (3.3665) time: 5.7346 data: 0.0002 max mem: 71323
+[12:59:34.927666] Epoch: [1] [2370/3112] lr: 0.000007 closs: 0.3224 (0.5944) grad_norm: 2.5316 (3.3760) time: 5.7354 data: 0.0002 max mem: 71323
+[13:00:32.136918] Epoch: [1] [2380/3112] lr: 0.000007 closs: 0.4442 (0.5948) grad_norm: 2.5316 (3.3804) time: 5.7252 data: 0.0002 max mem: 71323
+[13:01:29.440826] Epoch: [1] [2390/3112] lr: 0.000007 closs: 0.5976 (0.5944) grad_norm: 2.2028 (3.3783) time: 5.7255 data: 0.0002 max mem: 71323
+[13:02:26.575804] Epoch: [1] [2400/3112] lr: 0.000007 closs: 0.5502 (0.5937) grad_norm: 1.9132 (3.3785) time: 5.7218 data: 0.0002 max mem: 71323
+[13:03:23.765802] Epoch: [1] [2410/3112] lr: 0.000007 closs: 0.5240 (0.5936) grad_norm: 2.3872 (3.4208) time: 5.7161 data: 0.0002 max mem: 71323
+[13:04:21.098935] Epoch: [1] [2420/3112] lr: 0.000007 closs: 0.4521 (0.5933) grad_norm: 2.4755 (3.4228) time: 5.7260 data: 0.0002 max mem: 71323
+[13:05:18.433205] Epoch: [1] [2430/3112] lr: 0.000007 closs: 0.5407 (0.5942) grad_norm: 2.4755 (3.4253) time: 5.7332 data: 0.0002 max mem: 71323
+[13:06:15.579293] Epoch: [1] [2440/3112] lr: 0.000007 closs: 0.5482 (0.5934) grad_norm: 2.4755 (3.4169) time: 5.7238 data: 0.0002 max mem: 71323
+[13:07:12.842807] Epoch: [1] [2450/3112] lr: 0.000007 closs: 0.3710 (0.5936) grad_norm: 2.4755 (3.4133) time: 5.7203 data: 0.0002 max mem: 71323
+[13:08:10.124383] Epoch: [1] [2460/3112] lr: 0.000007 closs: 0.4706 (0.5935) grad_norm: 2.1906 (3.4068) time: 5.7271 data: 0.0002 max mem: 71323
+[13:09:07.307918] Epoch: [1] [2470/3112] lr: 0.000007 closs: 0.6117 (0.5939) grad_norm: 2.1906 (3.4046) time: 5.7231 data: 0.0002 max mem: 71323
+[13:10:04.498184] Epoch: [1] [2480/3112] lr: 0.000007 closs: 0.5846 (0.5939) grad_norm: 2.3872 (3.4108) time: 5.7186 data: 0.0002 max mem: 71323
+[13:11:01.662731] Epoch: [1] [2490/3112] lr: 0.000007 closs: 0.4590 (0.5936) grad_norm: 2.4755 (3.4087) time: 5.7176 data: 0.0002 max mem: 71323
+[13:11:58.920706] Epoch: [1] [2500/3112] lr: 0.000007 closs: 0.4590 (0.5933) grad_norm: 2.1239 (3.4017) time: 5.7210 data: 0.0002 max mem: 71323
+[13:12:56.147097] Epoch: [1] [2510/3112] lr: 0.000007 closs: 0.5225 (0.5935) grad_norm: 2.1239 (3.3982) time: 5.7241 data: 0.0002 max mem: 71323
+[13:13:53.558497] Epoch: [1] [2520/3112] lr: 0.000007 closs: 0.6081 (0.5933) grad_norm: 2.1239 (3.3869) time: 5.7318 data: 0.0002 max mem: 71323
+[13:14:50.888720] Epoch: [1] [2530/3112] lr: 0.000007 closs: 0.6044 (0.5929) grad_norm: 2.1239 (3.3824) time: 5.7370 data: 0.0002 max mem: 71323
+[13:15:48.254677] Epoch: [1] [2540/3112] lr: 0.000007 closs: 0.5823 (0.5935) grad_norm: 2.1897 (3.3783) time: 5.7346 data: 0.0002 max mem: 71323
+[13:16:45.475176] Epoch: [1] [2550/3112] lr: 0.000007 closs: 0.5102 (0.5931) grad_norm: 2.1620 (3.3736) time: 5.7292 data: 0.0002 max mem: 71323
+[13:17:42.644944] Epoch: [1] [2560/3112] lr: 0.000007 closs: 0.3917 (0.5924) grad_norm: 2.1123 (3.3711) time: 5.7194 data: 0.0002 max mem: 71323
+[13:18:39.819779] Epoch: [1] [2570/3112] lr: 0.000007 closs: 0.4010 (0.5923) grad_norm: 2.0445 (3.3686) time: 5.7171 data: 0.0002 max mem: 71323
+[13:19:37.024999] Epoch: [1] [2580/3112] lr: 0.000007 closs: 0.5028 (0.5928) grad_norm: 2.0445 (3.3646) time: 5.7189 data: 0.0002 max mem: 71323
+[13:20:34.258191] Epoch: [1] [2590/3112] lr: 0.000007 closs: 0.3090 (0.5926) grad_norm: 2.0445 (3.3605) time: 5.7217 data: 0.0002 max mem: 71323
+[13:21:31.585910] Epoch: [1] [2600/3112] lr: 0.000007 closs: 0.3090 (0.5920) grad_norm: 2.1620 (3.3549) time: 5.7279 data: 0.0002 max mem: 71323
+[13:22:28.843749] Epoch: [1] [2610/3112] lr: 0.000007 closs: 0.4656 (0.5928) grad_norm: 2.1956 (3.3556) time: 5.7291 data: 0.0002 max mem: 71323
+[13:23:26.382824] Epoch: [1] [2620/3112] lr: 0.000007 closs: 0.4347 (0.5925) grad_norm: 2.1123 (3.3517) time: 5.7397 data: 0.0002 max mem: 71323
+[13:24:23.556024] Epoch: [1] [2630/3112] lr: 0.000007 closs: 0.5205 (0.5933) grad_norm: 2.1956 (3.3468) time: 5.7355 data: 0.0002 max mem: 71323
+[13:25:20.796146] Epoch: [1] [2640/3112] lr: 0.000007 closs: 0.6251 (0.5934) grad_norm: 2.1956 (3.3446) time: 5.7205 data: 0.0002 max mem: 71323
+[13:26:17.847923] Epoch: [1] [2650/3112] lr: 0.000007 closs: 0.4358 (0.5926) grad_norm: 2.8057 (3.3465) time: 5.7144 data: 0.0002 max mem: 71323
+[13:27:15.085120] Epoch: [1] [2660/3112] lr: 0.000007 closs: 0.5810 (0.5933) grad_norm: 2.8057 (3.3486) time: 5.7143 data: 0.0002 max mem: 71323
+[13:28:12.201846] Epoch: [1] [2670/3112] lr: 0.000006 closs: 0.5782 (0.5942) grad_norm: 2.9163 (3.3500) time: 5.7175 data: 0.0002 max mem: 71323
+[13:29:09.482041] Epoch: [1] [2680/3112] lr: 0.000006 closs: 0.5222 (0.5946) grad_norm: 2.9163 (3.3513) time: 5.7197 data: 0.0002 max mem: 71323
+[13:30:06.935591] Epoch: [1] [2690/3112] lr: 0.000006 closs: 0.5222 (0.5944) grad_norm: 2.8412 (3.3509) time: 5.7366 data: 0.0002 max mem: 71323
+[13:31:04.053633] Epoch: [1] [2700/3112] lr: 0.000006 closs: 0.5555 (0.5948) grad_norm: 2.9613 (3.3646) time: 5.7285 data: 0.0002 max mem: 71323
+[13:32:01.184985] Epoch: [1] [2710/3112] lr: 0.000006 closs: 0.5376 (0.5939) grad_norm: 3.5938 (3.3658) time: 5.7123 data: 0.0002 max mem: 71323
+[13:32:58.500339] Epoch: [1] [2720/3112] lr: 0.000006 closs: 0.4558 (0.5937) grad_norm: 3.8010 (3.3688) time: 5.7222 data: 0.0002 max mem: 71323
+[13:33:55.754635] Epoch: [1] [2730/3112] lr: 0.000006 closs: 0.4563 (0.5942) grad_norm: 3.8010 (3.3688) time: 5.7283 data: 0.0002 max mem: 71323
+[13:34:53.083023] Epoch: [1] [2740/3112] lr: 0.000006 closs: 0.4563 (0.5951) grad_norm: 3.8744 (3.3708) time: 5.7290 data: 0.0002 max mem: 71323
+[13:35:50.368226] Epoch: [1] [2750/3112] lr: 0.000006 closs: 0.6571 (0.5967) grad_norm: 4.0016 (3.3723) time: 5.7306 data: 0.0002 max mem: 71323
+[13:36:47.566923] Epoch: [1] [2760/3112] lr: 0.000006 closs: 0.6401 (0.5969) grad_norm: 3.5938 (3.3686) time: 5.7241 data: 0.0002 max mem: 71323
+[13:37:44.784796] Epoch: [1] [2770/3112] lr: 0.000006 closs: 0.4982 (0.5972) grad_norm: 4.0016 (3.3736) time: 5.7207 data: 0.0002 max mem: 71323
+[13:38:42.032414] Epoch: [1] [2780/3112] lr: 0.000006 closs: 0.5479 (0.5972) grad_norm: 3.4739 (3.3706) time: 5.7232 data: 0.0002 max mem: 71323
+[13:39:39.258619] Epoch: [1] [2790/3112] lr: 0.000006 closs: 0.5479 (0.5968) grad_norm: 3.1176 (3.3670) time: 5.7236 data: 0.0002 max mem: 71323
+[13:40:36.387286] Epoch: [1] [2800/3112] lr: 0.000006 closs: 0.3827 (0.5963) grad_norm: 3.1176 (3.3742) time: 5.7177 data: 0.0002 max mem: 71323
+[13:41:33.515834] Epoch: [1] [2810/3112] lr: 0.000006 closs: 0.3538 (0.5958) grad_norm: 3.1176 (3.3721) time: 5.7128 data: 0.0002 max mem: 71323
+[13:42:30.660864] Epoch: [1] [2820/3112] lr: 0.000006 closs: 0.6189 (0.5966) grad_norm: 3.0800 (3.3655) time: 5.7136 data: 0.0002 max mem: 71323
+[13:43:27.936292] Epoch: [1] [2830/3112] lr: 0.000006 closs: 0.6189 (0.5972) grad_norm: 2.5449 (3.3605) time: 5.7209 data: 0.0002 max mem: 71323
+[13:44:25.340988] Epoch: [1] [2840/3112] lr: 0.000006 closs: 0.6008 (0.5978) grad_norm: 2.8261 (3.3598) time: 5.7339 data: 0.0002 max mem: 71323
+[13:45:22.723589] Epoch: [1] [2850/3112] lr: 0.000006 closs: 0.5505 (0.5975) grad_norm: 2.7837 (3.3596) time: 5.7392 data: 0.0002 max mem: 71323
+[13:46:19.918755] Epoch: [1] [2860/3112] lr: 0.000006 closs: 0.4604 (0.5970) grad_norm: 2.5449 (3.3553) time: 5.7286 data: 0.0002 max mem: 71323
+[13:47:17.087962] Epoch: [1] [2870/3112] lr: 0.000006 closs: 0.4973 (0.5971) grad_norm: 2.6828 (3.3531) time: 5.7179 data: 0.0002 max mem: 71323
+[13:48:14.364820] Epoch: [1] [2880/3112] lr: 0.000006 closs: 0.3841 (0.5961) grad_norm: 2.5449 (3.3543) time: 5.7222 data: 0.0002 max mem: 71323
+[13:49:11.625732] Epoch: [1] [2890/3112] lr: 0.000006 closs: 0.3523 (0.5954) grad_norm: 2.5449 (3.3566) time: 5.7268 data: 0.0002 max mem: 71323
+[13:50:08.857221] Epoch: [1] [2900/3112] lr: 0.000006 closs: 0.3739 (0.5946) grad_norm: 2.6828 (3.3563) time: 5.7245 data: 0.0002 max mem: 71323
+[13:51:06.039455] Epoch: [1] [2910/3112] lr: 0.000006 closs: 0.4970 (0.5954) grad_norm: 2.6828 (3.3505) time: 5.7206 data: 0.0002 max mem: 71323
+[13:52:03.227180] Epoch: [1] [2920/3112] lr: 0.000006 closs: 0.4924 (0.5953) grad_norm: 2.4712 (3.3503) time: 5.7184 data: 0.0002 max mem: 71323
+[13:53:00.414632] Epoch: [1] [2930/3112] lr: 0.000006 closs: 0.4924 (0.5953) grad_norm: 2.1076 (3.3464) time: 5.7187 data: 0.0002 max mem: 71323
+[13:53:57.642531] Epoch: [1] [2940/3112] lr: 0.000006 closs: 0.5576 (0.5952) grad_norm: 2.1076 (3.3432) time: 5.7207 data: 0.0002 max mem: 71323
+[13:54:54.812974] Epoch: [1] [2950/3112] lr: 0.000006 closs: 0.5219 (0.5951) grad_norm: 2.2410 (3.3409) time: 5.7198 data: 0.0002 max mem: 71323
+[13:55:51.986400] Epoch: [1] [2960/3112] lr: 0.000006 closs: 0.5302 (0.5954) grad_norm: 2.2410 (3.3426) time: 5.7170 data: 0.0002 max mem: 71323
+[13:56:49.161787] Epoch: [1] [2970/3112] lr: 0.000006 closs: 0.5302 (0.5952) grad_norm: 2.2410 (3.3401) time: 5.7173 data: 0.0002 max mem: 71323
+[13:57:46.386583] Epoch: [1] [2980/3112] lr: 0.000006 closs: 0.5034 (0.5958) grad_norm: 2.2410 (3.3348) time: 5.7199 data: 0.0002 max mem: 71323
+[13:58:43.521350] Epoch: [1] [2990/3112] lr: 0.000005 closs: 0.5034 (0.5951) grad_norm: 2.2632 (3.3320) time: 5.7178 data: 0.0002 max mem: 71323
+[13:59:40.926336] Epoch: [1] [3000/3112] lr: 0.000005 closs: 0.5936 (0.5957) grad_norm: 2.2494 (3.3270) time: 5.7268 data: 0.0002 max mem: 71323
+[14:00:38.238949] Epoch: [1] [3010/3112] lr: 0.000005 closs: 0.5449 (0.5950) grad_norm: 2.2911 (3.3388) time: 5.7357 data: 0.0002 max mem: 71323
+[14:01:35.508067] Epoch: [1] [3020/3112] lr: 0.000005 closs: 0.4764 (0.5949) grad_norm: 2.2632 (3.3334) time: 5.7290 data: 0.0002 max mem: 71323
+[14:02:32.696492] Epoch: [1] [3030/3112] lr: 0.000005 closs: 0.5640 (0.5948) grad_norm: 2.2494 (3.3323) time: 5.7228 data: 0.0002 max mem: 71323
+[14:03:29.903125] Epoch: [1] [3040/3112] lr: 0.000005 closs: 0.5597 (0.5950) grad_norm: 2.2494 (3.3297) time: 5.7196 data: 0.0002 max mem: 71323
+[14:04:27.145373] Epoch: [1] [3050/3112] lr: 0.000005 closs: 0.4935 (0.5950) grad_norm: 2.1169 (3.3253) time: 5.7223 data: 0.0002 max mem: 71323
+[14:05:24.592201] Epoch: [1] [3060/3112] lr: 0.000005 closs: 0.4388 (0.5944) grad_norm: 2.2494 (3.3205) time: 5.7343 data: 0.0002 max mem: 71323
+[14:06:21.714444] Epoch: [1] [3070/3112] lr: 0.000005 closs: 0.4388 (0.5941) grad_norm: 2.3191 (3.3199) time: 5.7283 data: 0.0002 max mem: 71323
+[14:07:18.875723] Epoch: [1] [3080/3112] lr: 0.000005 closs: 0.4525 (0.5946) grad_norm: 2.5830 (3.3189) time: 5.7140 data: 0.0002 max mem: 71323
+[14:08:16.055493] Epoch: [1] [3090/3112] lr: 0.000005 closs: 0.5828 (0.5946) grad_norm: 2.3191 (3.3157) time: 5.7169 data: 0.0002 max mem: 71323
+[14:09:13.282011] Epoch: [1] [3100/3112] lr: 0.000005 closs: 0.4670 (0.5939) grad_norm: 2.7254 (3.3163) time: 5.7201 data: 0.0002 max mem: 71323
+[14:10:10.525548] Epoch: [1] [3110/3112] lr: 0.000005 closs: 0.4383 (0.5942) grad_norm: 2.7254 (3.3160) time: 5.7233 data: 0.0001 max mem: 71323
+[14:10:16.590300] Epoch: [1] Total time: 4:56:55
+[14:10:16.646204] Averaged stats: lr: 0.000005 closs: 0.4383 (0.5820) grad_norm: 2.7254 (3.3142)
+[14:10:16.823073] model saved
+[14:10:18.586728] optimizer saved
+[14:10:18.587367] other rank-common saved
+[14:10:18.590942] rank-specific saved
+[14:10:18.601086] log_dir: ./output_dir
+[14:10:25.814869] Epoch: [2] [0/3112] lr: 0.000005 closs: 0.4360 (0.4360) time: 7.2128 data: 1.5226 max mem: 71323
+[14:11:23.029028] Epoch: [2] [10/3112] lr: 0.000005 closs: 0.6074 (0.6652) grad_norm: 1.1510 (1.7776) time: 5.8568 data: 0.1386 max mem: 71323
+[14:12:20.381583] Epoch: [2] [20/3112] lr: 0.000005 closs: 0.5463 (0.5534) grad_norm: 1.1541 (1.7179) time: 5.7282 data: 0.0002 max mem: 71323
+[14:13:17.726367] Epoch: [2] [30/3112] lr: 0.000005 closs: 0.4470 (0.5344) grad_norm: 1.4024 (1.8552) time: 5.7347 data: 0.0002 max mem: 71323
+[14:14:14.931247] Epoch: [2] [40/3112] lr: 0.000005 closs: 0.5431 (0.6070) grad_norm: 2.4042 (2.3449) time: 5.7274 data: 0.0002 max mem: 71323
+[14:15:12.313387] Epoch: [2] [50/3112] lr: 0.000005 closs: 0.5570 (0.5885) grad_norm: 1.4024 (2.1452) time: 5.7292 data: 0.0002 max mem: 71323
+[14:16:09.717658] Epoch: [2] [60/3112] lr: 0.000005 closs: 0.5216 (0.5955) grad_norm: 2.4042 (2.3341) time: 5.7391 data: 0.0002 max mem: 71323
+[14:17:06.965852] Epoch: [2] [70/3112] lr: 0.000005 closs: 0.4466 (0.5681) grad_norm: 2.4756 (2.4122) time: 5.7324 data: 0.0002 max mem: 71323
+[14:18:04.227048] Epoch: [2] [80/3112] lr: 0.000005 closs: 0.4193 (0.5739) grad_norm: 2.4756 (2.6078) time: 5.7253 data: 0.0002 max mem: 71323
+[14:19:01.531251] Epoch: [2] [90/3112] lr: 0.000005 closs: 0.4102 (0.5723) grad_norm: 2.9618 (2.7173) time: 5.7281 data: 0.0002 max mem: 71323
+[14:19:58.873607] Epoch: [2] [100/3112] lr: 0.000005 closs: 0.5055 (0.5931) grad_norm: 2.9618 (2.8619) time: 5.7322 data: 0.0002 max mem: 71323
+[14:20:56.179405] Epoch: [2] [110/3112] lr: 0.000005 closs: 0.5156 (0.5908) grad_norm: 2.9212 (2.7294) time: 5.7323 data: 0.0002 max mem: 71323
+[14:21:53.621774] Epoch: [2] [120/3112] lr: 0.000005 closs: 0.4414 (0.5978) grad_norm: 2.7157 (2.6279) time: 5.7373 data: 0.0002 max mem: 71323
+[14:22:50.788748] Epoch: [2] [130/3112] lr: 0.000005 closs: 0.5164 (0.6036) grad_norm: 2.9212 (2.7252) time: 5.7303 data: 0.0002 max mem: 71323
+[14:23:47.878249] Epoch: [2] [140/3112] lr: 0.000005 closs: 0.5542 (0.6015) grad_norm: 2.9523 (2.7765) time: 5.7127 data: 0.0002 max mem: 71323
+[14:24:45.096459] Epoch: [2] [150/3112] lr: 0.000005 closs: 0.5151 (0.6003) grad_norm: 2.9212 (2.7445) time: 5.7153 data: 0.0002 max mem: 71323
+[14:25:42.328890] Epoch: [2] [160/3112] lr: 0.000005 closs: 0.4948 (0.5991) grad_norm: 2.8809 (2.7850) time: 5.7223 data: 0.0002 max mem: 71323
+[14:26:39.604416] Epoch: [2] [170/3112] lr: 0.000005 closs: 0.4948 (0.5957) grad_norm: 2.7727 (2.7791) time: 5.7252 data: 0.0002 max mem: 71323
+[14:27:36.858102] Epoch: [2] [180/3112] lr: 0.000005 closs: 0.5062 (0.5940) grad_norm: 2.6414 (2.7271) time: 5.7263 data: 0.0002 max mem: 71323
+[14:28:34.083629] Epoch: [2] [190/3112] lr: 0.000005 closs: 0.3496 (0.5808) grad_norm: 2.8809 (2.7651) time: 5.7238 data: 0.0002 max mem: 71323
+[14:29:31.415923] Epoch: [2] [200/3112] lr: 0.000005 closs: 0.4586 (0.5830) grad_norm: 2.9501 (2.7475) time: 5.7278 data: 0.0002 max mem: 71323
+[14:30:28.740693] Epoch: [2] [210/3112] lr: 0.000005 closs: 0.6143 (0.5840) grad_norm: 2.9501 (2.7892) time: 5.7328 data: 0.0002 max mem: 71323
+[14:31:26.119725] Epoch: [2] [220/3112] lr: 0.000004 closs: 0.3812 (0.5742) grad_norm: 2.8809 (2.9147) time: 5.7351 data: 0.0002 max mem: 71323
+[14:32:23.303395] Epoch: [2] [230/3112] lr: 0.000004 closs: 0.2071 (0.5760) grad_norm: 3.0244 (2.9154) time: 5.7280 data: 0.0002 max mem: 71323
+[14:33:20.393140] Epoch: [2] [240/3112] lr: 0.000004 closs: 0.3576 (0.5692) grad_norm: 3.0556 (2.9465) time: 5.7135 data: 0.0002 max mem: 71323
+[14:34:17.543229] Epoch: [2] [250/3112] lr: 0.000004 closs: 0.4125 (0.5691) grad_norm: 3.0244 (2.9074) time: 5.7119 data: 0.0002 max mem: 71323
+[14:35:14.755843] Epoch: [2] [260/3112] lr: 0.000004 closs: 0.4664 (0.5705) grad_norm: 3.0556 (2.8683) time: 5.7180 data: 0.0002 max mem: 71323
+[14:36:11.903644] Epoch: [2] [270/3112] lr: 0.000004 closs: 0.4721 (0.5671) grad_norm: 3.3290 (2.8993) time: 5.7179 data: 0.0002 max mem: 71323
+[14:37:09.193778] Epoch: [2] [280/3112] lr: 0.000004 closs: 0.4722 (0.5675) grad_norm: 2.4226 (2.8360) time: 5.7218 data: 0.0002 max mem: 71323
+[14:38:06.290280] Epoch: [2] [290/3112] lr: 0.000004 closs: 0.4234 (0.5570) grad_norm: 2.4226 (2.8661) time: 5.7192 data: 0.0002 max mem: 71323
+[14:39:03.490528] Epoch: [2] [300/3112] lr: 0.000004 closs: 0.1751 (0.5527) grad_norm: 2.4226 (2.8397) time: 5.7147 data: 0.0002 max mem: 71323
+[14:40:00.658561] Epoch: [2] [310/3112] lr: 0.000004 closs: 0.2418 (0.5454) grad_norm: 1.8474 (2.7978) time: 5.7183 data: 0.0002 max mem: 71323
+[14:40:57.922081] Epoch: [2] [320/3112] lr: 0.000004 closs: 0.3416 (0.5422) grad_norm: 1.5908 (2.8122) time: 5.7214 data: 0.0002 max mem: 71323
+[14:41:55.093136] Epoch: [2] [330/3112] lr: 0.000004 closs: 0.4215 (0.5395) grad_norm: 1.8154 (2.8548) time: 5.7216 data: 0.0002 max mem: 71323
+[14:42:52.358864] Epoch: [2] [340/3112] lr: 0.000004 closs: 0.4013 (0.5360) grad_norm: 1.9718 (2.8178) time: 5.7217 data: 0.0002 max mem: 71323
+[14:43:49.594475] Epoch: [2] [350/3112] lr: 0.000004 closs: 0.4082 (0.5385) grad_norm: 1.5888 (2.7779) time: 5.7249 data: 0.0002 max mem: 71323
+[14:44:46.824311] Epoch: [2] [360/3112] lr: 0.000004 closs: 0.4371 (0.5413) grad_norm: 1.8154 (2.7447) time: 5.7231 data: 0.0001 max mem: 71323
+[14:45:44.127444] Epoch: [2] [370/3112] lr: 0.000004 closs: 0.5517 (0.5415) grad_norm: 1.8154 (2.7556) time: 5.7265 data: 0.0002 max mem: 71323
+[14:46:41.438547] Epoch: [2] [380/3112] lr: 0.000004 closs: 0.5517 (0.5438) grad_norm: 1.8154 (2.8606) time: 5.7306 data: 0.0002 max mem: 71323
+[14:47:38.612605] Epoch: [2] [390/3112] lr: 0.000004 closs: 0.5489 (0.5430) grad_norm: 1.9718 (2.8663) time: 5.7242 data: 0.0002 max mem: 71323
+[14:48:35.879547] Epoch: [2] [400/3112] lr: 0.000004 closs: 0.5836 (0.5418) grad_norm: 1.9718 (2.8590) time: 5.7219 data: 0.0002 max mem: 71323
+[14:49:33.140901] Epoch: [2] [410/3112] lr: 0.000004 closs: 0.5117 (0.5411) grad_norm: 1.9718 (2.8813) time: 5.7263 data: 0.0002 max mem: 71323
+[14:50:30.314098] Epoch: [2] [420/3112] lr: 0.000004 closs: 0.5141 (0.5412) grad_norm: 2.1015 (2.8810) time: 5.7216 data: 0.0002 max mem: 71323
+[14:51:27.392113] Epoch: [2] [430/3112] lr: 0.000004 closs: 0.5396 (0.5416) grad_norm: 3.1541 (2.9040) time: 5.7125 data: 0.0002 max mem: 71323
+[14:52:24.592936] Epoch: [2] [440/3112] lr: 0.000004 closs: 0.4379 (0.5379) grad_norm: 3.1541 (2.9065) time: 5.7138 data: 0.0003 max mem: 71323
+[14:53:21.717913] Epoch: [2] [450/3112] lr: 0.000004 closs: 0.3255 (0.5362) grad_norm: 2.5157 (2.8794) time: 5.7162 data: 0.0003 max mem: 71323
+[14:54:18.961814] Epoch: [2] [460/3112] lr: 0.000004 closs: 0.5022 (0.5395) grad_norm: 2.5157 (2.8955) time: 5.7183 data: 0.0002 max mem: 71323
+[14:55:16.243085] Epoch: [2] [470/3112] lr: 0.000004 closs: 0.5022 (0.5412) grad_norm: 2.5157 (2.8819) time: 5.7262 data: 0.0002 max mem: 71323
+[14:56:13.445829] Epoch: [2] [480/3112] lr: 0.000004 closs: 0.4838 (0.5410) grad_norm: 2.9418 (2.9094) time: 5.7241 data: 0.0001 max mem: 71323
+[14:57:10.678649] Epoch: [2] [490/3112] lr: 0.000004 closs: 0.5739 (0.5410) grad_norm: 2.9418 (2.9045) time: 5.7217 data: 0.0002 max mem: 71323
+[14:58:07.929596] Epoch: [2] [500/3112] lr: 0.000004 closs: 0.5736 (0.5435) grad_norm: 2.9418 (2.8888) time: 5.7241 data: 0.0002 max mem: 71323
+[14:59:05.135218] Epoch: [2] [510/3112] lr: 0.000004 closs: 0.7866 (0.5489) grad_norm: 2.5816 (2.8753) time: 5.7227 data: 0.0002 max mem: 71323
+[15:00:02.321225] Epoch: [2] [520/3112] lr: 0.000004 closs: 0.7884 (0.5538) grad_norm: 1.7227 (2.8745) time: 5.7195 data: 0.0001 max mem: 71323
+[15:00:59.659541] Epoch: [2] [530/3112] lr: 0.000004 closs: 0.4922 (0.5529) grad_norm: 2.5816 (2.8793) time: 5.7261 data: 0.0001 max mem: 71323
+[15:01:56.847167] Epoch: [2] [540/3112] lr: 0.000004 closs: 0.4445 (0.5541) grad_norm: 2.5816 (2.8715) time: 5.7261 data: 0.0001 max mem: 71323
+[15:02:54.143400] Epoch: [2] [550/3112] lr: 0.000004 closs: 0.5808 (0.5558) grad_norm: 1.9575 (2.8419) time: 5.7241 data: 0.0002 max mem: 71323
+[15:03:51.318059] Epoch: [2] [560/3112] lr: 0.000004 closs: 0.3878 (0.5517) grad_norm: 1.7227 (2.8361) time: 5.7234 data: 0.0002 max mem: 71323
+[15:04:48.567286] Epoch: [2] [570/3112] lr: 0.000004 closs: 0.3777 (0.5522) grad_norm: 1.9575 (2.8945) time: 5.7210 data: 0.0002 max mem: 71323
+[15:05:45.749204] Epoch: [2] [580/3112] lr: 0.000004 closs: 0.5287 (0.5542) grad_norm: 2.5816 (2.9065) time: 5.7214 data: 0.0001 max mem: 71323
+[15:06:42.926418] Epoch: [2] [590/3112] lr: 0.000003 closs: 0.3925 (0.5520) grad_norm: 2.6845 (2.9114) time: 5.7178 data: 0.0001 max mem: 71323
+[15:07:40.102990] Epoch: [2] [600/3112] lr: 0.000003 closs: 0.2956 (0.5509) grad_norm: 2.3870 (2.8898) time: 5.7175 data: 0.0002 max mem: 71323
+[15:08:37.392221] Epoch: [2] [610/3112] lr: 0.000003 closs: 0.4191 (0.5515) grad_norm: 2.3870 (2.8787) time: 5.7231 data: 0.0002 max mem: 71323
+[15:09:34.644761] Epoch: [2] [620/3112] lr: 0.000003 closs: 0.4191 (0.5480) grad_norm: 2.6845 (2.8987) time: 5.7269 data: 0.0002 max mem: 71323
+[15:10:31.846178] Epoch: [2] [630/3112] lr: 0.000003 closs: 0.4704 (0.5503) grad_norm: 2.6845 (2.8876) time: 5.7225 data: 0.0002 max mem: 71323
+[15:11:29.170018] Epoch: [2] [640/3112] lr: 0.000003 closs: 0.4080 (0.5481) grad_norm: 2.6845 (2.8819) time: 5.7261 data: 0.0002 max mem: 71323
+[15:12:26.290687] Epoch: [2] [650/3112] lr: 0.000003 closs: 0.4080 (0.5468) grad_norm: 2.6825 (2.8796) time: 5.7221 data: 0.0002 max mem: 71323
+[15:13:23.489433] Epoch: [2] [660/3112] lr: 0.000003 closs: 0.5232 (0.5478) grad_norm: 2.3870 (2.8602) time: 5.7159 data: 0.0002 max mem: 71323
+[15:14:20.559527] Epoch: [2] [670/3112] lr: 0.000003 closs: 0.5659 (0.5481) grad_norm: 2.2963 (2.8474) time: 5.7133 data: 0.0002 max mem: 71323
+[15:15:17.735895] Epoch: [2] [680/3112] lr: 0.000003 closs: 0.4805 (0.5473) grad_norm: 2.2242 (2.8431) time: 5.7121 data: 0.0002 max mem: 71323
+[15:16:15.001445] Epoch: [2] [690/3112] lr: 0.000003 closs: 0.4814 (0.5520) grad_norm: 2.4723 (2.8633) time: 5.7219 data: 0.0002 max mem: 71323
+[15:17:12.343813] Epoch: [2] [700/3112] lr: 0.000003 closs: 0.4236 (0.5464) grad_norm: 2.2242 (2.8802) time: 5.7302 data: 0.0002 max mem: 71323
+[15:18:09.510911] Epoch: [2] [710/3112] lr: 0.000003 closs: 0.4293 (0.5489) grad_norm: 2.6825 (2.8969) time: 5.7253 data: 0.0002 max mem: 71323
+[15:19:06.688198] Epoch: [2] [720/3112] lr: 0.000003 closs: 0.4392 (0.5471) grad_norm: 2.9360 (2.8956) time: 5.7171 data: 0.0002 max mem: 71323
+[15:20:03.829622] Epoch: [2] [730/3112] lr: 0.000003 closs: 0.3283 (0.5454) grad_norm: 3.2592 (2.9282) time: 5.7158 data: 0.0002 max mem: 71323
+[15:21:01.016940] Epoch: [2] [740/3112] lr: 0.000003 closs: 0.3283 (0.5457) grad_norm: 2.9360 (2.9117) time: 5.7163 data: 0.0002 max mem: 71323
+[15:21:58.228901] Epoch: [2] [750/3112] lr: 0.000003 closs: 0.3429 (0.5423) grad_norm: 2.9360 (2.9010) time: 5.7199 data: 0.0002 max mem: 71323
+[15:22:55.565015] Epoch: [2] [760/3112] lr: 0.000003 closs: 0.4707 (0.5470) grad_norm: 3.3278 (2.9126) time: 5.7273 data: 0.0002 max mem: 71323
+[15:23:52.737133] Epoch: [2] [770/3112] lr: 0.000003 closs: 0.7071 (0.5461) grad_norm: 2.9360 (2.9004) time: 5.7253 data: 0.0002 max mem: 71323
+[15:24:50.021263] Epoch: [2] [780/3112] lr: 0.000003 closs: 0.4724 (0.5477) grad_norm: 3.0359 (2.9363) time: 5.7227 data: 0.0001 max mem: 71323
+[15:25:47.253852] Epoch: [2] [790/3112] lr: 0.000003 closs: 0.6427 (0.5493) grad_norm: 2.9360 (2.9492) time: 5.7257 data: 0.0001 max mem: 71323
+[15:26:44.446685] Epoch: [2] [800/3112] lr: 0.000003 closs: 0.6252 (0.5498) grad_norm: 2.8269 (2.9444) time: 5.7211 data: 0.0002 max mem: 71323
+[15:27:41.615290] Epoch: [2] [810/3112] lr: 0.000003 closs: 0.5673 (0.5490) grad_norm: 2.4856 (2.9577) time: 5.7179 data: 0.0002 max mem: 71323
+[15:28:38.872154] Epoch: [2] [820/3112] lr: 0.000003 closs: 0.3670 (0.5480) grad_norm: 2.2165 (2.9349) time: 5.7211 data: 0.0002 max mem: 71323
+[15:29:36.133987] Epoch: [2] [830/3112] lr: 0.000003 closs: 0.5225 (0.5519) grad_norm: 2.4856 (2.9325) time: 5.7258 data: 0.0002 max mem: 71323
+[15:30:33.399475] Epoch: [2] [840/3112] lr: 0.000003 closs: 0.7116 (0.5533) grad_norm: 2.4856 (2.9689) time: 5.7263 data: 0.0001 max mem: 71323
+[15:31:30.790390] Epoch: [2] [850/3112] lr: 0.000003 closs: 0.6568 (0.5536) grad_norm: 3.1141 (2.9683) time: 5.7327 data: 0.0002 max mem: 71323
+[15:32:28.127673] Epoch: [2] [860/3112] lr: 0.000003 closs: 0.4290 (0.5534) grad_norm: 2.4483 (2.9627) time: 5.7363 data: 0.0002 max mem: 71323
+[15:33:25.397189] Epoch: [2] [870/3112] lr: 0.000003 closs: 0.5605 (0.5547) grad_norm: 2.7166 (2.9696) time: 5.7302 data: 0.0002 max mem: 71323
+[15:34:22.664758] Epoch: [2] [880/3112] lr: 0.000003 closs: 0.5992 (0.5584) grad_norm: 2.7166 (2.9637) time: 5.7267 data: 0.0002 max mem: 71323
+[15:35:19.827983] Epoch: [2] [890/3112] lr: 0.000003 closs: 0.4973 (0.5583) grad_norm: 2.7166 (2.9642) time: 5.7214 data: 0.0002 max mem: 71323
+[15:36:17.127788] Epoch: [2] [900/3112] lr: 0.000003 closs: 0.4510 (0.5574) grad_norm: 2.8765 (2.9539) time: 5.7231 data: 0.0002 max mem: 71323
+[15:37:14.309242] Epoch: [2] [910/3112] lr: 0.000003 closs: 0.4510 (0.5579) grad_norm: 2.8765 (2.9486) time: 5.7240 data: 0.0002 max mem: 71323
+[15:38:11.615594] Epoch: [2] [920/3112] lr: 0.000003 closs: 0.6863 (0.5582) grad_norm: 2.7166 (2.9577) time: 5.7243 data: 0.0002 max mem: 71323
+[15:39:08.806479] Epoch: [2] [930/3112] lr: 0.000003 closs: 0.6926 (0.5574) grad_norm: 2.7166 (2.9568) time: 5.7247 data: 0.0002 max mem: 71323
+[15:40:06.022434] Epoch: [2] [940/3112] lr: 0.000003 closs: 0.6926 (0.5627) grad_norm: 2.7166 (2.9596) time: 5.7202 data: 0.0002 max mem: 71323
+[15:41:03.239016] Epoch: [2] [950/3112] lr: 0.000003 closs: 0.7976 (0.5648) grad_norm: 2.7133 (2.9711) time: 5.7215 data: 0.0002 max mem: 71323
+[15:42:00.479113] Epoch: [2] [960/3112] lr: 0.000003 closs: 0.4479 (0.5614) grad_norm: 2.3917 (2.9499) time: 5.7227 data: 0.0002 max mem: 71323
+[15:42:57.719522] Epoch: [2] [970/3112] lr: 0.000003 closs: 0.2548 (0.5586) grad_norm: 2.7133 (2.9663) time: 5.7239 data: 0.0002 max mem: 71323
+[15:43:55.059654] Epoch: [2] [980/3112] lr: 0.000003 closs: 0.4611 (0.5591) grad_norm: 2.2535 (2.9467) time: 5.7289 data: 0.0002 max mem: 71323
+[15:44:52.360050] Epoch: [2] [990/3112] lr: 0.000003 closs: 0.5401 (0.5574) grad_norm: 2.2535 (2.9366) time: 5.7319 data: 0.0002 max mem: 71323
+[15:45:49.584037] Epoch: [2] [1000/3112] lr: 0.000002 closs: 0.4817 (0.5576) grad_norm: 1.7784 (2.9406) time: 5.7261 data: 0.0002 max mem: 71323
+[15:46:46.958237] Epoch: [2] [1010/3112] lr: 0.000002 closs: 0.5037 (0.5598) grad_norm: 1.7784 (2.9345) time: 5.7298 data: 0.0002 max mem: 71323
+[15:47:44.301404] Epoch: [2] [1020/3112] lr: 0.000002 closs: 0.5697 (0.5593) grad_norm: 2.0977 (2.9493) time: 5.7358 data: 0.0002 max mem: 71323
+[15:48:41.561769] Epoch: [2] [1030/3112] lr: 0.000002 closs: 0.4396 (0.5597) grad_norm: 2.2402 (2.9572) time: 5.7301 data: 0.0002 max mem: 71323
+[15:49:38.767090] Epoch: [2] [1040/3112] lr: 0.000002 closs: 0.3587 (0.5585) grad_norm: 2.2402 (2.9607) time: 5.7232 data: 0.0002 max mem: 71323
+[15:50:36.111608] Epoch: [2] [1050/3112] lr: 0.000002 closs: 0.5382 (0.5600) grad_norm: 1.9305 (2.9632) time: 5.7274 data: 0.0002 max mem: 71323
+[15:51:33.306243] Epoch: [2] [1060/3112] lr: 0.000002 closs: 0.7736 (0.5631) grad_norm: 2.8702 (3.0150) time: 5.7268 data: 0.0002 max mem: 71323
+[15:52:30.482828] Epoch: [2] [1070/3112] lr: 0.000002 closs: 0.6207 (0.5632) grad_norm: 3.1697 (3.0367) time: 5.7184 data: 0.0002 max mem: 71323
+[15:53:27.743712] Epoch: [2] [1080/3112] lr: 0.000002 closs: 0.5278 (0.5635) grad_norm: 3.6272 (3.0471) time: 5.7217 data: 0.0002 max mem: 71323
+[15:54:25.011056] Epoch: [2] [1090/3112] lr: 0.000002 closs: 0.4878 (0.5635) grad_norm: 4.2024 (3.0465) time: 5.7262 data: 0.0002 max mem: 71323
+[15:55:22.172982] Epoch: [2] [1100/3112] lr: 0.000002 closs: 0.4597 (0.5628) grad_norm: 4.2024 (3.0551) time: 5.7213 data: 0.0002 max mem: 71323
+[15:56:19.308356] Epoch: [2] [1110/3112] lr: 0.000002 closs: 0.5194 (0.5645) grad_norm: 3.8334 (3.0453) time: 5.7147 data: 0.0002 max mem: 71323
+[15:57:16.547768] Epoch: [2] [1120/3112] lr: 0.000002 closs: 0.5453 (0.5638) grad_norm: 4.2024 (3.0675) time: 5.7186 data: 0.0002 max mem: 71323
+[15:58:13.704842] Epoch: [2] [1130/3112] lr: 0.000002 closs: 0.4442 (0.5634) grad_norm: 4.2024 (3.0721) time: 5.7197 data: 0.0002 max mem: 71323
+[15:59:10.982523] Epoch: [2] [1140/3112] lr: 0.000002 closs: 0.4511 (0.5635) grad_norm: 3.6272 (3.0681) time: 5.7216 data: 0.0002 max mem: 71323
+[16:00:08.298102] Epoch: [2] [1150/3112] lr: 0.000002 closs: 0.5186 (0.5629) grad_norm: 2.8722 (3.0591) time: 5.7295 data: 0.0002 max mem: 71323
+[16:01:05.492495] Epoch: [2] [1160/3112] lr: 0.000002 closs: 0.4260 (0.5621) grad_norm: 2.8722 (3.0744) time: 5.7253 data: 0.0002 max mem: 71323
+[16:02:02.855099] Epoch: [2] [1170/3112] lr: 0.000002 closs: 0.4111 (0.5611) grad_norm: 2.8722 (3.0902) time: 5.7277 data: 0.0002 max mem: 71323
+[16:03:00.077422] Epoch: [2] [1180/3112] lr: 0.000002 closs: 0.4512 (0.5613) grad_norm: 2.6858 (3.0919) time: 5.7291 data: 0.0002 max mem: 71323
+[16:03:57.264846] Epoch: [2] [1190/3112] lr: 0.000002 closs: 0.4249 (0.5606) grad_norm: 3.0053 (3.0973) time: 5.7203 data: 0.0002 max mem: 71323
+[16:04:54.443387] Epoch: [2] [1200/3112] lr: 0.000002 closs: 0.4314 (0.5602) grad_norm: 3.0053 (3.0949) time: 5.7181 data: 0.0002 max mem: 71323
+[16:05:51.639144] Epoch: [2] [1210/3112] lr: 0.000002 closs: 0.4397 (0.5602) grad_norm: 2.8722 (3.0865) time: 5.7185 data: 0.0002 max mem: 71323
+[16:06:48.774336] Epoch: [2] [1220/3112] lr: 0.000002 closs: 0.3820 (0.5582) grad_norm: 3.0053 (3.0972) time: 5.7164 data: 0.0002 max mem: 71323
+[16:07:45.971894] Epoch: [2] [1230/3112] lr: 0.000002 closs: 0.4727 (0.5595) grad_norm: 3.0607 (3.0997) time: 5.7165 data: 0.0002 max mem: 71323
+[16:08:43.272488] Epoch: [2] [1240/3112] lr: 0.000002 closs: 0.5085 (0.5569) grad_norm: 3.1624 (3.1075) time: 5.7248 data: 0.0002 max mem: 71323
+[16:09:40.468696] Epoch: [2] [1250/3112] lr: 0.000002 closs: 0.4113 (0.5559) grad_norm: 3.1624 (3.1055) time: 5.7247 data: 0.0002 max mem: 71323
+[16:10:37.589011] Epoch: [2] [1260/3112] lr: 0.000002 closs: 0.3655 (0.5538) grad_norm: 3.2363 (3.1033) time: 5.7156 data: 0.0002 max mem: 71323
+[16:11:34.816674] Epoch: [2] [1270/3112] lr: 0.000002 closs: 0.3023 (0.5529) grad_norm: 3.3121 (3.1074) time: 5.7172 data: 0.0002 max mem: 71323
+[16:12:32.099671] Epoch: [2] [1280/3112] lr: 0.000002 closs: 0.4118 (0.5526) grad_norm: 3.2363 (3.1036) time: 5.7254 data: 0.0002 max mem: 71323
+[16:13:29.325442] Epoch: [2] [1290/3112] lr: 0.000002 closs: 0.5622 (0.5534) grad_norm: 3.2552 (3.1040) time: 5.7253 data: 0.0002 max mem: 71323
+[16:14:26.548503] Epoch: [2] [1300/3112] lr: 0.000002 closs: 0.5622 (0.5541) grad_norm: 3.2552 (3.1044) time: 5.7223 data: 0.0001 max mem: 71323
+[16:15:23.756870] Epoch: [2] [1310/3112] lr: 0.000002 closs: 0.4878 (0.5554) grad_norm: 3.2363 (3.1045) time: 5.7214 data: 0.0002 max mem: 71323
+[16:16:21.014296] Epoch: [2] [1320/3112] lr: 0.000002 closs: 0.4807 (0.5549) grad_norm: 3.1002 (3.0954) time: 5.7231 data: 0.0002 max mem: 71323
+[16:17:18.398652] Epoch: [2] [1330/3112] lr: 0.000002 closs: 0.4628 (0.5547) grad_norm: 3.1002 (3.0986) time: 5.7320 data: 0.0001 max mem: 71323
+[16:18:15.699997] Epoch: [2] [1340/3112] lr: 0.000002 closs: 0.4757 (0.5556) grad_norm: 3.0788 (3.0912) time: 5.7342 data: 0.0002 max mem: 71323
+[16:19:12.907753] Epoch: [2] [1350/3112] lr: 0.000002 closs: 0.5646 (0.5561) grad_norm: 2.7291 (3.0898) time: 5.7254 data: 0.0002 max mem: 71323
+[16:20:10.120899] Epoch: [2] [1360/3112] lr: 0.000002 closs: 0.4836 (0.5548) grad_norm: 2.7291 (3.0867) time: 5.7210 data: 0.0002 max mem: 71323
+[16:21:07.452592] Epoch: [2] [1370/3112] lr: 0.000002 closs: 0.5154 (0.5556) grad_norm: 2.7291 (3.0890) time: 5.7271 data: 0.0002 max mem: 71323
+[16:22:04.637249] Epoch: [2] [1380/3112] lr: 0.000002 closs: 0.6741 (0.5564) grad_norm: 2.9465 (3.0927) time: 5.7256 data: 0.0002 max mem: 71323
+[16:23:01.877168] Epoch: [2] [1390/3112] lr: 0.000002 closs: 0.5181 (0.5561) grad_norm: 2.7291 (3.0974) time: 5.7211 data: 0.0002 max mem: 71323
+[16:23:59.174594] Epoch: [2] [1400/3112] lr: 0.000002 closs: 0.5181 (0.5562) grad_norm: 2.7607 (3.0859) time: 5.7268 data: 0.0002 max mem: 71323
+[16:24:56.424658] Epoch: [2] [1410/3112] lr: 0.000002 closs: 0.3618 (0.5547) grad_norm: 2.7607 (3.0818) time: 5.7273 data: 0.0002 max mem: 71323
+[16:25:53.733548] Epoch: [2] [1420/3112] lr: 0.000002 closs: 0.3124 (0.5539) grad_norm: 2.9465 (3.1123) time: 5.7279 data: 0.0002 max mem: 71323
+[16:26:51.017710] Epoch: [2] [1430/3112] lr: 0.000002 closs: 0.3762 (0.5529) grad_norm: 2.7607 (3.1016) time: 5.7295 data: 0.0002 max mem: 71323
+[16:27:48.278496] Epoch: [2] [1440/3112] lr: 0.000002 closs: 0.4757 (0.5528) grad_norm: 2.7607 (3.0935) time: 5.7271 data: 0.0002 max mem: 71323
+[16:28:45.469520] Epoch: [2] [1450/3112] lr: 0.000002 closs: 0.5066 (0.5534) grad_norm: 2.7103 (3.0944) time: 5.7225 data: 0.0002 max mem: 71323
+[16:29:42.600558] Epoch: [2] [1460/3112] lr: 0.000002 closs: 0.3717 (0.5531) grad_norm: 2.7103 (3.1274) time: 5.7160 data: 0.0002 max mem: 71323
+[16:30:39.809573] Epoch: [2] [1470/3112] lr: 0.000002 closs: 0.3273 (0.5528) grad_norm: 2.3646 (3.1227) time: 5.7169 data: 0.0002 max mem: 71323
+[16:31:37.007061] Epoch: [2] [1480/3112] lr: 0.000002 closs: 0.4005 (0.5524) grad_norm: 2.3646 (3.1164) time: 5.7202 data: 0.0002 max mem: 71323
+[16:32:34.425194] Epoch: [2] [1490/3112] lr: 0.000001 closs: 0.4531 (0.5523) grad_norm: 2.3646 (3.1139) time: 5.7306 data: 0.0002 max mem: 71323
+[16:33:31.850056] Epoch: [2] [1500/3112] lr: 0.000001 closs: 0.5008 (0.5516) grad_norm: 2.3646 (3.1160) time: 5.7420 data: 0.0002 max mem: 71323
+[16:34:29.181374] Epoch: [2] [1510/3112] lr: 0.000001 closs: 0.3488 (0.5508) grad_norm: 2.6741 (3.1090) time: 5.7377 data: 0.0001 max mem: 71323
+[16:35:26.377858] Epoch: [2] [1520/3112] lr: 0.000001 closs: 0.3735 (0.5505) grad_norm: 2.6741 (3.1092) time: 5.7263 data: 0.0002 max mem: 71323
+[16:36:23.703953] Epoch: [2] [1530/3112] lr: 0.000001 closs: 0.5611 (0.5511) grad_norm: 2.4420 (3.1031) time: 5.7260 data: 0.0002 max mem: 71323
+[16:37:20.897722] Epoch: [2] [1540/3112] lr: 0.000001 closs: 0.4669 (0.5513) grad_norm: 2.3758 (3.1128) time: 5.7259 data: 0.0002 max mem: 71323
+[16:38:18.051841] Epoch: [2] [1550/3112] lr: 0.000001 closs: 0.4560 (0.5529) grad_norm: 2.6417 (3.1219) time: 5.7172 data: 0.0002 max mem: 71323
+[16:39:15.406671] Epoch: [2] [1560/3112] lr: 0.000001 closs: 0.7055 (0.5544) grad_norm: 3.3556 (3.1320) time: 5.7253 data: 0.0002 max mem: 71323
+[16:40:12.567474] Epoch: [2] [1570/3112] lr: 0.000001 closs: 0.4278 (0.5530) grad_norm: 2.6741 (3.1286) time: 5.7257 data: 0.0002 max mem: 71323
+[16:41:09.707941] Epoch: [2] [1580/3112] lr: 0.000001 closs: 0.3476 (0.5515) grad_norm: 2.9255 (3.1314) time: 5.7150 data: 0.0002 max mem: 71323
+[16:42:06.857718] Epoch: [2] [1590/3112] lr: 0.000001 closs: 0.3187 (0.5502) grad_norm: 2.9660 (3.1308) time: 5.7144 data: 0.0002 max mem: 71323
+[16:43:04.021581] Epoch: [2] [1600/3112] lr: 0.000001 closs: 0.3838 (0.5513) grad_norm: 2.9255 (3.1208) time: 5.7156 data: 0.0002 max mem: 71323
+[16:44:01.336677] Epoch: [2] [1610/3112] lr: 0.000001 closs: 0.6496 (0.5515) grad_norm: 2.9660 (3.1191) time: 5.7238 data: 0.0002 max mem: 71323
+[16:44:58.624495] Epoch: [2] [1620/3112] lr: 0.000001 closs: 0.5132 (0.5516) grad_norm: 2.6096 (3.1053) time: 5.7300 data: 0.0001 max mem: 71323
+[16:45:55.757227] Epoch: [2] [1630/3112] lr: 0.000001 closs: 0.4484 (0.5518) grad_norm: 2.3159 (3.1006) time: 5.7209 data: 0.0002 max mem: 71323
+[16:46:52.959544] Epoch: [2] [1640/3112] lr: 0.000001 closs: 0.4336 (0.5512) grad_norm: 1.9353 (3.0963) time: 5.7167 data: 0.0002 max mem: 71323
+[16:47:50.389121] Epoch: [2] [1650/3112] lr: 0.000001 closs: 0.4271 (0.5518) grad_norm: 1.9353 (3.0914) time: 5.7315 data: 0.0001 max mem: 71323
+[16:48:47.792617] Epoch: [2] [1660/3112] lr: 0.000001 closs: 0.3952 (0.5518) grad_norm: 1.7323 (3.0795) time: 5.7416 data: 0.0002 max mem: 71323
+[16:49:45.070300] Epoch: [2] [1670/3112] lr: 0.000001 closs: 0.3936 (0.5515) grad_norm: 1.7323 (3.0814) time: 5.7340 data: 0.0002 max mem: 71323
+[16:50:42.271190] Epoch: [2] [1680/3112] lr: 0.000001 closs: 0.4496 (0.5504) grad_norm: 2.0028 (3.0962) time: 5.7238 data: 0.0002 max mem: 71323
+[16:51:39.393900] Epoch: [2] [1690/3112] lr: 0.000001 closs: 0.4506 (0.5495) grad_norm: 2.1759 (3.0984) time: 5.7161 data: 0.0002 max mem: 71323
+[16:52:36.555760] Epoch: [2] [1700/3112] lr: 0.000001 closs: 0.5006 (0.5505) grad_norm: 2.6321 (3.1019) time: 5.7141 data: 0.0002 max mem: 71323
+[16:53:33.830935] Epoch: [2] [1710/3112] lr: 0.000001 closs: 0.5749 (0.5506) grad_norm: 2.6321 (3.1057) time: 5.7218 data: 0.0002 max mem: 71323
+[16:54:31.133242] Epoch: [2] [1720/3112] lr: 0.000001 closs: 0.5749 (0.5512) grad_norm: 2.7198 (3.1663) time: 5.7287 data: 0.0002 max mem: 71323
+[16:55:28.239374] Epoch: [2] [1730/3112] lr: 0.000001 closs: 0.5259 (0.5517) grad_norm: 2.8688 (3.1633) time: 5.7203 data: 0.0002 max mem: 71323
+[16:56:25.463747] Epoch: [2] [1740/3112] lr: 0.000001 closs: 0.4380 (0.5515) grad_norm: 3.7357 (3.1638) time: 5.7164 data: 0.0002 max mem: 71323
+[16:57:22.682785] Epoch: [2] [1750/3112] lr: 0.000001 closs: 0.3994 (0.5503) grad_norm: 2.8688 (3.1528) time: 5.7220 data: 0.0002 max mem: 71323
+[16:58:19.939936] Epoch: [2] [1760/3112] lr: 0.000001 closs: 0.4163 (0.5495) grad_norm: 2.5488 (3.1468) time: 5.7237 data: 0.0002 max mem: 71323
+[16:59:17.164538] Epoch: [2] [1770/3112] lr: 0.000001 closs: 0.4802 (0.5495) grad_norm: 2.4022 (3.1397) time: 5.7240 data: 0.0001 max mem: 71323
+[17:00:14.460973] Epoch: [2] [1780/3112] lr: 0.000001 closs: 0.5756 (0.5502) grad_norm: 2.1401 (3.1345) time: 5.7260 data: 0.0001 max mem: 71323
+[17:01:11.617494] Epoch: [2] [1790/3112] lr: 0.000001 closs: 0.5873 (0.5510) grad_norm: 1.9773 (3.1343) time: 5.7226 data: 0.0002 max mem: 71323
+[17:02:08.893937] Epoch: [2] [1800/3112] lr: 0.000001 closs: 0.5847 (0.5514) grad_norm: 1.7235 (3.1407) time: 5.7215 data: 0.0002 max mem: 71323
+[17:03:06.252176] Epoch: [2] [1810/3112] lr: 0.000001 closs: 0.6149 (0.5516) grad_norm: 1.8893 (3.1398) time: 5.7316 data: 0.0001 max mem: 71323
+[17:04:03.529001] Epoch: [2] [1820/3112] lr: 0.000001 closs: 0.4632 (0.5510) grad_norm: 1.8893 (3.1352) time: 5.7316 data: 0.0002 max mem: 71323
+[17:05:00.813998] Epoch: [2] [1830/3112] lr: 0.000001 closs: 0.4433 (0.5510) grad_norm: 1.9773 (3.1355) time: 5.7279 data: 0.0002 max mem: 71323
+[17:05:58.014226] Epoch: [2] [1840/3112] lr: 0.000001 closs: 0.4977 (0.5507) grad_norm: 1.9773 (3.1310) time: 5.7241 data: 0.0002 max mem: 71323
+[17:06:55.313526] Epoch: [2] [1850/3112] lr: 0.000001 closs: 0.4552 (0.5499) grad_norm: 2.3335 (3.1317) time: 5.7248 data: 0.0002 max mem: 71323
+[17:07:52.501250] Epoch: [2] [1860/3112] lr: 0.000001 closs: 0.4585 (0.5493) grad_norm: 2.3968 (3.1289) time: 5.7243 data: 0.0001 max mem: 71323
+[17:08:49.735229] Epoch: [2] [1870/3112] lr: 0.000001 closs: 0.4814 (0.5491) grad_norm: 2.5577 (3.1284) time: 5.7210 data: 0.0002 max mem: 71323
+[17:09:47.042574] Epoch: [2] [1880/3112] lr: 0.000001 closs: 0.4409 (0.5486) grad_norm: 2.6857 (3.1481) time: 5.7270 data: 0.0002 max mem: 71323
+[17:10:44.379177] Epoch: [2] [1890/3112] lr: 0.000001 closs: 0.4395 (0.5485) grad_norm: 2.6857 (3.1458) time: 5.7321 data: 0.0002 max mem: 71323
+[17:11:41.579235] Epoch: [2] [1900/3112] lr: 0.000001 closs: 0.5093 (0.5489) grad_norm: 2.6857 (3.1393) time: 5.7267 data: 0.0002 max mem: 71323
+[17:12:38.860239] Epoch: [2] [1910/3112] lr: 0.000001 closs: 0.5925 (0.5492) grad_norm: 2.8399 (3.1473) time: 5.7240 data: 0.0002 max mem: 71323
+[17:13:36.091801] Epoch: [2] [1920/3112] lr: 0.000001 closs: 0.5174 (0.5487) grad_norm: 3.3990 (3.1500) time: 5.7255 data: 0.0002 max mem: 71323
+[17:14:33.389930] Epoch: [2] [1930/3112] lr: 0.000001 closs: 0.5174 (0.5495) grad_norm: 3.3990 (3.1509) time: 5.7263 data: 0.0002 max mem: 71323
+[17:15:30.703249] Epoch: [2] [1940/3112] lr: 0.000001 closs: 0.6323 (0.5494) grad_norm: 3.3990 (3.1515) time: 5.7304 data: 0.0002 max mem: 71323
+[17:16:27.947736] Epoch: [2] [1950/3112] lr: 0.000001 closs: 0.4128 (0.5483) grad_norm: 2.8977 (3.1472) time: 5.7278 data: 0.0001 max mem: 71323
+[17:17:25.185104] Epoch: [2] [1960/3112] lr: 0.000001 closs: 0.3227 (0.5479) grad_norm: 2.4152 (3.1424) time: 5.7240 data: 0.0002 max mem: 71323
+[17:18:22.540099] Epoch: [2] [1970/3112] lr: 0.000001 closs: 0.5304 (0.5477) grad_norm: 2.8866 (3.1511) time: 5.7295 data: 0.0002 max mem: 71323
+[17:19:19.795766] Epoch: [2] [1980/3112] lr: 0.000001 closs: 0.5862 (0.5480) grad_norm: 2.8866 (3.1461) time: 5.7304 data: 0.0002 max mem: 71323
+[17:20:17.066951] Epoch: [2] [1990/3112] lr: 0.000001 closs: 0.5642 (0.5479) grad_norm: 2.8184 (3.1463) time: 5.7262 data: 0.0002 max mem: 71323
+[17:21:14.246832] Epoch: [2] [2000/3112] lr: 0.000001 closs: 0.4396 (0.5476) grad_norm: 2.8184 (3.1497) time: 5.7224 data: 0.0002 max mem: 71323
+[17:22:11.397298] Epoch: [2] [2010/3112] lr: 0.000001 closs: 0.3506 (0.5468) grad_norm: 2.7756 (3.1451) time: 5.7163 data: 0.0002 max mem: 71323
+[17:23:08.611252] Epoch: [2] [2020/3112] lr: 0.000001 closs: 0.4199 (0.5467) grad_norm: 2.8789 (3.1547) time: 5.7181 data: 0.0002 max mem: 71323
+[17:24:05.743755] Epoch: [2] [2030/3112] lr: 0.000001 closs: 0.5761 (0.5473) grad_norm: 2.8866 (3.1520) time: 5.7172 data: 0.0002 max mem: 71323
+[17:25:03.206602] Epoch: [2] [2040/3112] lr: 0.000001 closs: 0.4013 (0.5468) grad_norm: 3.0206 (3.1499) time: 5.7297 data: 0.0002 max mem: 71323
+[17:26:00.336590] Epoch: [2] [2050/3112] lr: 0.000001 closs: 0.3266 (0.5467) grad_norm: 2.8789 (3.1486) time: 5.7295 data: 0.0002 max mem: 71323
+[17:26:57.498407] Epoch: [2] [2060/3112] lr: 0.000001 closs: 0.5191 (0.5465) grad_norm: 2.8789 (3.1449) time: 5.7145 data: 0.0002 max mem: 71323
+[17:27:54.751474] Epoch: [2] [2070/3112] lr: 0.000001 closs: 0.5391 (0.5471) grad_norm: 2.7310 (3.1389) time: 5.7206 data: 0.0002 max mem: 71323
+[17:28:52.028295] Epoch: [2] [2080/3112] lr: 0.000001 closs: 0.3936 (0.5464) grad_norm: 2.4290 (3.1335) time: 5.7264 data: 0.0002 max mem: 71323
+[17:29:49.277771] Epoch: [2] [2090/3112] lr: 0.000001 closs: 0.4002 (0.5466) grad_norm: 2.4290 (3.1291) time: 5.7262 data: 0.0002 max mem: 71323
+[17:30:46.629098] Epoch: [2] [2100/3112] lr: 0.000001 closs: 0.5508 (0.5468) grad_norm: 2.4290 (3.1293) time: 5.7299 data: 0.0001 max mem: 71323
+[17:31:43.858123] Epoch: [2] [2110/3112] lr: 0.000001 closs: 0.6561 (0.5472) grad_norm: 2.6172 (3.1400) time: 5.7289 data: 0.0001 max mem: 71323
+[17:32:41.040247] Epoch: [2] [2120/3112] lr: 0.000001 closs: 0.5432 (0.5468) grad_norm: 2.6172 (3.1366) time: 5.7205 data: 0.0002 max mem: 71323
+[17:33:38.475514] Epoch: [2] [2130/3112] lr: 0.000001 closs: 0.5101 (0.5470) grad_norm: 2.6172 (3.1362) time: 5.7308 data: 0.0002 max mem: 71323
+[17:34:35.652870] Epoch: [2] [2140/3112] lr: 0.000001 closs: 0.4765 (0.5467) grad_norm: 2.7026 (3.1310) time: 5.7305 data: 0.0002 max mem: 71323
+[17:35:32.956312] Epoch: [2] [2150/3112] lr: 0.000001 closs: 0.4277 (0.5467) grad_norm: 3.0525 (3.1318) time: 5.7239 data: 0.0002 max mem: 71323
+[17:36:30.104571] Epoch: [2] [2160/3112] lr: 0.000001 closs: 0.4548 (0.5469) grad_norm: 3.1398 (3.1305) time: 5.7225 data: 0.0002 max mem: 71323
+[17:37:27.303287] Epoch: [2] [2170/3112] lr: 0.000001 closs: 0.4548 (0.5467) grad_norm: 3.1398 (3.1284) time: 5.7172 data: 0.0002 max mem: 71323
+[17:38:24.568365] Epoch: [2] [2180/3112] lr: 0.000001 closs: 0.4096 (0.5463) grad_norm: 2.7986 (3.1247) time: 5.7230 data: 0.0002 max mem: 71323
+[17:39:21.691117] Epoch: [2] [2190/3112] lr: 0.000000 closs: 0.3783 (0.5468) grad_norm: 2.7026 (3.1220) time: 5.7193 data: 0.0002 max mem: 71323
+[17:40:19.027890] Epoch: [2] [2200/3112] lr: 0.000000 closs: 0.4893 (0.5477) grad_norm: 2.7986 (3.1314) time: 5.7228 data: 0.0002 max mem: 71323
+[17:41:16.408750] Epoch: [2] [2210/3112] lr: 0.000000 closs: 0.5441 (0.5477) grad_norm: 2.7986 (3.1300) time: 5.7357 data: 0.0001 max mem: 71323
+[17:42:13.645756] Epoch: [2] [2220/3112] lr: 0.000000 closs: 0.5059 (0.5475) grad_norm: 2.9381 (3.1312) time: 5.7308 data: 0.0002 max mem: 71323
+[17:43:10.953730] Epoch: [2] [2230/3112] lr: 0.000000 closs: 0.5672 (0.5477) grad_norm: 2.9139 (3.1341) time: 5.7271 data: 0.0002 max mem: 71323
+[17:44:08.411583] Epoch: [2] [2240/3112] lr: 0.000000 closs: 0.5757 (0.5478) grad_norm: 2.8344 (3.1272) time: 5.7381 data: 0.0002 max mem: 71323
+[17:45:05.709238] Epoch: [2] [2250/3112] lr: 0.000000 closs: 0.4157 (0.5473) grad_norm: 2.8344 (3.1272) time: 5.7376 data: 0.0002 max mem: 71323
+[17:46:02.931477] Epoch: [2] [2260/3112] lr: 0.000000 closs: 0.4914 (0.5489) grad_norm: 2.9139 (3.1308) time: 5.7259 data: 0.0002 max mem: 71323
+[17:47:00.195348] Epoch: [2] [2270/3112] lr: 0.000000 closs: 0.5939 (0.5487) grad_norm: 3.0481 (3.1388) time: 5.7242 data: 0.0002 max mem: 71323
+[17:47:57.546292] Epoch: [2] [2280/3112] lr: 0.000000 closs: 0.4314 (0.5498) grad_norm: 2.8344 (3.1296) time: 5.7307 data: 0.0002 max mem: 71323
+[17:48:54.797476] Epoch: [2] [2290/3112] lr: 0.000000 closs: 0.4340 (0.5496) grad_norm: 2.9381 (3.1331) time: 5.7300 data: 0.0002 max mem: 71323
+[17:49:52.070780] Epoch: [2] [2300/3112] lr: 0.000000 closs: 0.3886 (0.5493) grad_norm: 3.2621 (3.1401) time: 5.7261 data: 0.0002 max mem: 71323
+[17:50:49.318417] Epoch: [2] [2310/3112] lr: 0.000000 closs: 0.6357 (0.5512) grad_norm: 2.8344 (3.1360) time: 5.7260 data: 0.0002 max mem: 71323
+[17:51:46.562626] Epoch: [2] [2320/3112] lr: 0.000000 closs: 0.7008 (0.5516) grad_norm: 3.2621 (3.1324) time: 5.7245 data: 0.0002 max mem: 71323
+[17:52:43.699326] Epoch: [2] [2330/3112] lr: 0.000000 closs: 0.5531 (0.5517) grad_norm: 2.7129 (3.1262) time: 5.7189 data: 0.0002 max mem: 71323
+[17:53:40.875209] Epoch: [2] [2340/3112] lr: 0.000000 closs: 0.5495 (0.5516) grad_norm: 2.7129 (3.1275) time: 5.7154 data: 0.0002 max mem: 71323
+[17:54:38.031687] Epoch: [2] [2350/3112] lr: 0.000000 closs: 0.4549 (0.5509) grad_norm: 2.5737 (3.1283) time: 5.7164 data: 0.0001 max mem: 71323
+[17:55:35.448326] Epoch: [2] [2360/3112] lr: 0.000000 closs: 0.4820 (0.5522) grad_norm: 2.7604 (3.1316) time: 5.7285 data: 0.0002 max mem: 71323
+[17:56:32.618934] Epoch: [2] [2370/3112] lr: 0.000000 closs: 0.4820 (0.5515) grad_norm: 2.7604 (3.1348) time: 5.7292 data: 0.0002 max mem: 71323
+[17:57:29.848427] Epoch: [2] [2380/3112] lr: 0.000000 closs: 0.3367 (0.5518) grad_norm: 2.7129 (3.1356) time: 5.7198 data: 0.0002 max mem: 71323
+[17:58:27.087534] Epoch: [2] [2390/3112] lr: 0.000000 closs: 0.4248 (0.5511) grad_norm: 2.5737 (3.1314) time: 5.7232 data: 0.0002 max mem: 71323
+[17:59:24.323990] Epoch: [2] [2400/3112] lr: 0.000000 closs: 0.4248 (0.5505) grad_norm: 2.5737 (3.1284) time: 5.7236 data: 0.0002 max mem: 71323
+[18:00:21.721905] Epoch: [2] [2410/3112] lr: 0.000000 closs: 0.4980 (0.5509) grad_norm: 2.5737 (3.1225) time: 5.7315 data: 0.0002 max mem: 71323
+[18:01:18.891462] Epoch: [2] [2420/3112] lr: 0.000000 closs: 0.5943 (0.5511) grad_norm: 2.5322 (3.1227) time: 5.7282 data: 0.0001 max mem: 71323
+[18:02:16.156798] Epoch: [2] [2430/3112] lr: 0.000000 closs: 0.4906 (0.5516) grad_norm: 2.6653 (3.1219) time: 5.7216 data: 0.0001 max mem: 71323
+[18:03:13.371180] Epoch: [2] [2440/3112] lr: 0.000000 closs: 0.4906 (0.5515) grad_norm: 2.5289 (3.1198) time: 5.7239 data: 0.0002 max mem: 71323
+[18:04:10.711303] Epoch: [2] [2450/3112] lr: 0.000000 closs: 0.3710 (0.5507) grad_norm: 2.1071 (3.1147) time: 5.7276 data: 0.0002 max mem: 71323
+[18:05:08.065059] Epoch: [2] [2460/3112] lr: 0.000000 closs: 0.2865 (0.5500) grad_norm: 2.0226 (3.1079) time: 5.7346 data: 0.0002 max mem: 71323
+[18:06:05.361579] Epoch: [2] [2470/3112] lr: 0.000000 closs: 0.2976 (0.5494) grad_norm: 2.0246 (3.1148) time: 5.7324 data: 0.0002 max mem: 71323
+[18:07:02.494682] Epoch: [2] [2480/3112] lr: 0.000000 closs: 0.3998 (0.5494) grad_norm: 2.4643 (3.1213) time: 5.7214 data: 0.0002 max mem: 71323
+[18:07:59.769053] Epoch: [2] [2490/3112] lr: 0.000000 closs: 0.4586 (0.5489) grad_norm: 2.5096 (3.1168) time: 5.7202 data: 0.0002 max mem: 71323
+[18:08:57.048285] Epoch: [2] [2500/3112] lr: 0.000000 closs: 0.4586 (0.5484) grad_norm: 2.0856 (3.1118) time: 5.7275 data: 0.0002 max mem: 71323
+[18:09:54.323705] Epoch: [2] [2510/3112] lr: 0.000000 closs: 0.4033 (0.5483) grad_norm: 2.0856 (3.1112) time: 5.7276 data: 0.0002 max mem: 71323
+[18:10:51.560600] Epoch: [2] [2520/3112] lr: 0.000000 closs: 0.5089 (0.5499) grad_norm: 2.0508 (3.1087) time: 5.7255 data: 0.0002 max mem: 71323
+[18:11:48.683196] Epoch: [2] [2530/3112] lr: 0.000000 closs: 0.6246 (0.5501) grad_norm: 2.0856 (3.1085) time: 5.7178 data: 0.0002 max mem: 71323
+[18:12:45.840443] Epoch: [2] [2540/3112] lr: 0.000000 closs: 0.5564 (0.5506) grad_norm: 2.5096 (3.1020) time: 5.7139 data: 0.0002 max mem: 71323
+[18:13:43.114993] Epoch: [2] [2550/3112] lr: 0.000000 closs: 0.4533 (0.5503) grad_norm: 2.0508 (3.0963) time: 5.7215 data: 0.0002 max mem: 71323
+[18:14:40.472532] Epoch: [2] [2560/3112] lr: 0.000000 closs: 0.4501 (0.5501) grad_norm: 2.0429 (3.0926) time: 5.7314 data: 0.0002 max mem: 71323
+[18:15:37.712249] Epoch: [2] [2570/3112] lr: 0.000000 closs: 0.3993 (0.5494) grad_norm: 2.0429 (3.0886) time: 5.7297 data: 0.0002 max mem: 71323
+[18:16:34.850754] Epoch: [2] [2580/3112] lr: 0.000000 closs: 0.3750 (0.5487) grad_norm: 2.0508 (3.0956) time: 5.7187 data: 0.0002 max mem: 71323
+[18:17:31.955211] Epoch: [2] [2590/3112] lr: 0.000000 closs: 0.3750 (0.5481) grad_norm: 1.9941 (3.0913) time: 5.7120 data: 0.0002 max mem: 71323
+[18:18:29.183340] Epoch: [2] [2600/3112] lr: 0.000000 closs: 0.4272 (0.5491) grad_norm: 2.0429 (3.0947) time: 5.7165 data: 0.0002 max mem: 71323
+[18:19:26.555803] Epoch: [2] [2610/3112] lr: 0.000000 closs: 0.4462 (0.5486) grad_norm: 2.2441 (3.0922) time: 5.7299 data: 0.0002 max mem: 71323
+[18:20:23.792996] Epoch: [2] [2620/3112] lr: 0.000000 closs: 0.3844 (0.5484) grad_norm: 2.3066 (3.0896) time: 5.7304 data: 0.0002 max mem: 71323
+[18:21:21.087858] Epoch: [2] [2630/3112] lr: 0.000000 closs: 0.4619 (0.5487) grad_norm: 2.3103 (3.0881) time: 5.7265 data: 0.0002 max mem: 71323
+[18:22:18.318734] Epoch: [2] [2640/3112] lr: 0.000000 closs: 0.4699 (0.5491) grad_norm: 2.6225 (3.0915) time: 5.7261 data: 0.0002 max mem: 71323
+[18:23:15.533664] Epoch: [2] [2650/3112] lr: 0.000000 closs: 0.5859 (0.5494) grad_norm: 2.5562 (3.0876) time: 5.7222 data: 0.0002 max mem: 71323
+[18:24:12.835633] Epoch: [2] [2660/3112] lr: 0.000000 closs: 0.5929 (0.5502) grad_norm: 2.3393 (3.0870) time: 5.7258 data: 0.0002 max mem: 71323
+[18:25:10.048274] Epoch: [2] [2670/3112] lr: 0.000000 closs: 0.4960 (0.5500) grad_norm: 2.5562 (3.0851) time: 5.7256 data: 0.0002 max mem: 71323
+[18:26:07.329746] Epoch: [2] [2680/3112] lr: 0.000000 closs: 0.4874 (0.5503) grad_norm: 2.3385 (3.0815) time: 5.7246 data: 0.0002 max mem: 71323
+[18:27:04.697878] Epoch: [2] [2690/3112] lr: 0.000000 closs: 0.5814 (0.5508) grad_norm: 2.5562 (3.0852) time: 5.7324 data: 0.0002 max mem: 71323
+[18:28:01.890915] Epoch: [2] [2700/3112] lr: 0.000000 closs: 0.5814 (0.5509) grad_norm: 2.3393 (3.0826) time: 5.7279 data: 0.0002 max mem: 71323
+[18:28:59.334470] Epoch: [2] [2710/3112] lr: 0.000000 closs: 0.4852 (0.5502) grad_norm: 2.3393 (3.0823) time: 5.7317 data: 0.0002 max mem: 71323
+[18:29:56.560783] Epoch: [2] [2720/3112] lr: 0.000000 closs: 0.4209 (0.5495) grad_norm: 2.3393 (3.0812) time: 5.7334 data: 0.0002 max mem: 71323
+[18:30:53.739761] Epoch: [2] [2730/3112] lr: 0.000000 closs: 0.3620 (0.5488) grad_norm: 2.5089 (3.0793) time: 5.7202 data: 0.0001 max mem: 71323
+[18:31:50.964199] Epoch: [2] [2740/3112] lr: 0.000000 closs: 0.3586 (0.5485) grad_norm: 2.6375 (3.0829) time: 5.7200 data: 0.0001 max mem: 71323
+[18:32:48.239818] Epoch: [2] [2750/3112] lr: 0.000000 closs: 0.3658 (0.5483) grad_norm: 2.6375 (3.0892) time: 5.7249 data: 0.0002 max mem: 71323
+[18:33:45.322545] Epoch: [2] [2760/3112] lr: 0.000000 closs: 0.3658 (0.5479) grad_norm: 2.6375 (3.0879) time: 5.7178 data: 0.0002 max mem: 71323
+[18:34:42.605499] Epoch: [2] [2770/3112] lr: 0.000000 closs: 0.4803 (0.5476) grad_norm: 2.6375 (3.1004) time: 5.7182 data: 0.0002 max mem: 71323
+[18:35:39.968661] Epoch: [2] [2780/3112] lr: 0.000000 closs: 0.5414 (0.5476) grad_norm: 2.6375 (3.1002) time: 5.7322 data: 0.0001 max mem: 71323
+[18:36:37.252357] Epoch: [2] [2790/3112] lr: 0.000000 closs: 0.5414 (0.5482) grad_norm: 2.5089 (3.0952) time: 5.7322 data: 0.0002 max mem: 71323
+[18:37:34.518033] Epoch: [2] [2800/3112] lr: 0.000000 closs: 0.3933 (0.5481) grad_norm: 2.1094 (3.0972) time: 5.7273 data: 0.0002 max mem: 71323
+[18:38:31.685424] Epoch: [2] [2810/3112] lr: 0.000000 closs: 0.4144 (0.5482) grad_norm: 2.8711 (3.1030) time: 5.7215 data: 0.0002 max mem: 71323
+[18:39:28.869219] Epoch: [2] [2820/3112] lr: 0.000000 closs: 0.4585 (0.5485) grad_norm: 2.1094 (3.0959) time: 5.7174 data: 0.0002 max mem: 71323
+[18:40:26.073828] Epoch: [2] [2830/3112] lr: 0.000000 closs: 0.4968 (0.5488) grad_norm: 2.0664 (3.1092) time: 5.7193 data: 0.0002 max mem: 71323
+[18:41:23.399127] Epoch: [2] [2840/3112] lr: 0.000000 closs: 0.5736 (0.5489) grad_norm: 2.5924 (3.1070) time: 5.7264 data: 0.0002 max mem: 71323
+[18:42:20.566099] Epoch: [2] [2850/3112] lr: 0.000000 closs: 0.5121 (0.5484) grad_norm: 1.9081 (3.1038) time: 5.7245 data: 0.0002 max mem: 71323
+[18:43:17.802917] Epoch: [2] [2860/3112] lr: 0.000000 closs: 0.3677 (0.5478) grad_norm: 1.9081 (3.0977) time: 5.7200 data: 0.0001 max mem: 71323
+[18:44:15.110538] Epoch: [2] [2870/3112] lr: 0.000000 closs: 0.4021 (0.5479) grad_norm: 2.1613 (3.0995) time: 5.7270 data: 0.0002 max mem: 71323
+[18:45:12.397711] Epoch: [2] [2880/3112] lr: 0.000000 closs: 0.5212 (0.5478) grad_norm: 2.3927 (3.0998) time: 5.7296 data: 0.0002 max mem: 71323
+[18:46:09.613855] Epoch: [2] [2890/3112] lr: 0.000000 closs: 0.5212 (0.5478) grad_norm: 2.1613 (3.1078) time: 5.7250 data: 0.0002 max mem: 71323
+[18:47:06.845787] Epoch: [2] [2900/3112] lr: 0.000000 closs: 0.5010 (0.5473) grad_norm: 2.1613 (3.1023) time: 5.7223 data: 0.0002 max mem: 71323
+[18:48:04.094202] Epoch: [2] [2910/3112] lr: 0.000000 closs: 0.4676 (0.5473) grad_norm: 2.3927 (3.1069) time: 5.7239 data: 0.0001 max mem: 71323
+[18:49:01.435558] Epoch: [2] [2920/3112] lr: 0.000000 closs: 0.4501 (0.5467) grad_norm: 2.3927 (3.1040) time: 5.7294 data: 0.0001 max mem: 71323
+[18:49:58.859244] Epoch: [2] [2930/3112] lr: 0.000000 closs: 0.4652 (0.5471) grad_norm: 2.3927 (3.1018) time: 5.7382 data: 0.0001 max mem: 71323
+[18:50:56.156240] Epoch: [2] [2940/3112] lr: 0.000000 closs: 0.6036 (0.5468) grad_norm: 2.4706 (3.0980) time: 5.7359 data: 0.0001 max mem: 71323
+[18:51:53.339660] Epoch: [2] [2950/3112] lr: 0.000000 closs: 0.4337 (0.5472) grad_norm: 2.3927 (3.0941) time: 5.7239 data: 0.0001 max mem: 71323
+[18:52:50.590694] Epoch: [2] [2960/3112] lr: 0.000000 closs: 0.4581 (0.5474) grad_norm: 2.3264 (3.0906) time: 5.7216 data: 0.0002 max mem: 71323
+[18:53:47.802841] Epoch: [2] [2970/3112] lr: 0.000000 closs: 0.6419 (0.5480) grad_norm: 2.0883 (3.0853) time: 5.7231 data: 0.0002 max mem: 71323
+[18:54:44.949414] Epoch: [2] [2980/3112] lr: 0.000000 closs: 0.6419 (0.5485) grad_norm: 2.3264 (3.0833) time: 5.7178 data: 0.0002 max mem: 71323
+[18:55:42.064598] Epoch: [2] [2990/3112] lr: 0.000000 closs: 0.4028 (0.5483) grad_norm: 2.3264 (3.0819) time: 5.7130 data: 0.0002 max mem: 71323
+[18:56:39.329764] Epoch: [2] [3000/3112] lr: 0.000000 closs: 0.5313 (0.5484) grad_norm: 1.5794 (3.0786) time: 5.7189 data: 0.0002 max mem: 71323
+[18:57:36.399645] Epoch: [2] [3010/3112] lr: 0.000000 closs: 0.4671 (0.5477) grad_norm: 2.3264 (3.0829) time: 5.7167 data: 0.0002 max mem: 71323
+[18:58:33.665085] Epoch: [2] [3020/3112] lr: 0.000000 closs: 0.3776 (0.5474) grad_norm: 2.3606 (3.0827) time: 5.7167 data: 0.0002 max mem: 71323
+[18:59:31.025743] Epoch: [2] [3030/3112] lr: 0.000000 closs: 0.5004 (0.5479) grad_norm: 2.5170 (3.0842) time: 5.7312 data: 0.0002 max mem: 71323
+[19:00:28.304200] Epoch: [2] [3040/3112] lr: 0.000000 closs: 0.7527 (0.5490) grad_norm: 2.6338 (3.1124) time: 5.7319 data: 0.0002 max mem: 71323
+[19:01:25.587126] Epoch: [2] [3050/3112] lr: 0.000000 closs: 0.8295 (0.5493) grad_norm: 2.7310 (3.1111) time: 5.7279 data: 0.0002 max mem: 71323
+[19:02:22.787151] Epoch: [2] [3060/3112] lr: 0.000000 closs: 0.5859 (0.5498) grad_norm: 2.7310 (3.1106) time: 5.7240 data: 0.0001 max mem: 71323
+[19:03:19.998301] Epoch: [2] [3070/3112] lr: 0.000000 closs: 0.4667 (0.5492) grad_norm: 2.3187 (3.1070) time: 5.7205 data: 0.0002 max mem: 71323
+[19:04:17.367281] Epoch: [2] [3080/3112] lr: 0.000000 closs: 0.2941 (0.5488) grad_norm: 3.4401 (3.1087) time: 5.7289 data: 0.0002 max mem: 71323
+[19:05:14.771097] Epoch: [2] [3090/3112] lr: 0.000000 closs: 0.4463 (0.5491) grad_norm: 2.9133 (3.1143) time: 5.7385 data: 0.0002 max mem: 71323
+[19:06:12.166761] Epoch: [2] [3100/3112] lr: 0.000000 closs: 0.4048 (0.5486) grad_norm: 2.3187 (3.1108) time: 5.7398 data: 0.0002 max mem: 71323
+[19:07:09.372369] Epoch: [2] [3110/3112] lr: 0.000000 closs: 0.3660 (0.5479) grad_norm: 2.9133 (3.1197) time: 5.7300 data: 0.0001 max mem: 71323
+[19:07:15.465086] Epoch: [2] Total time: 4:56:56
+[19:07:15.494672] Averaged stats: lr: 0.000000 closs: 0.3782 (0.5619) grad_norm: 2.9133 (3.1170)
+[19:07:15.664578] model saved
+[19:07:17.413226] optimizer saved
+[19:07:17.413937] other rank-common saved
+[19:07:17.417500] rank-specific saved
+[19:07:17.417684] Training time 14:51:05