diff --git a/rwkv-0.pth b/rwkv-0.pth new file mode 100644 index 0000000000000000000000000000000000000000..842e5b5de2d27f281012f27f8191457ae9aee5da --- /dev/null +++ b/rwkv-0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e22a3e4960de80b4861520bb6b20de110a397d9d76d0d90a3eb7c1a97f94bdd9 +size 217209741 diff --git a/rwkv-100.pth b/rwkv-100.pth new file mode 100644 index 0000000000000000000000000000000000000000..75f1614a0dbf3bf4d49ae1d2374342f587307a2c --- /dev/null +++ b/rwkv-100.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c820de4cb8a5f73ec7671b2f1f53235d456ff0b2ec142995c82118b0332c8b5 +size 217209947 diff --git a/rwkv-102.pth b/rwkv-102.pth new file mode 100644 index 0000000000000000000000000000000000000000..c214fcc1f0be03c20521129b2a3b899eaac58a7e --- /dev/null +++ b/rwkv-102.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50625ef000c7d5d124044b89d84453e0e1b444e7e7c723c3d292c7719c465340 +size 217209947 diff --git a/rwkv-106.pth b/rwkv-106.pth new file mode 100644 index 0000000000000000000000000000000000000000..dc594d8f02e68fa7f5649eaf3871aca9bd48ba52 --- /dev/null +++ b/rwkv-106.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f81d7289e4f89c2ed6fc0c3970917c083f70a8fcafa6106c95ca12854a3a4fda +size 217209947 diff --git a/rwkv-11.pth b/rwkv-11.pth new file mode 100644 index 0000000000000000000000000000000000000000..50061795f3642a4e1511df5f97cfad64b75f1259 --- /dev/null +++ b/rwkv-11.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:478a041d2d06e241b5119734785d4a15bcbb131a7084dfcae3e564e12159cbdb +size 217209812 diff --git a/rwkv-114.pth b/rwkv-114.pth new file mode 100644 index 0000000000000000000000000000000000000000..0a14ae00ba59a32b0c8b2d397121d93315a62453 --- /dev/null +++ b/rwkv-114.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2509b32e34528e9c26ab45bf331da41dea1c5b49010d7d6c38a43ee53f6b11c +size 217209947 diff --git a/rwkv-12.pth b/rwkv-12.pth new file mode 100644 index 0000000000000000000000000000000000000000..55c44a8039c85eb03d0dd0bdfc6f3180788a84c9 --- /dev/null +++ b/rwkv-12.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7cf874a4fc10e521c6026798e8686b6c7995471fdbdc4f885d1f82401026247 +size 217209812 diff --git a/rwkv-128.pth b/rwkv-128.pth new file mode 100644 index 0000000000000000000000000000000000000000..e9b7d63771e15c5526aace2270ddebcb612597e9 --- /dev/null +++ b/rwkv-128.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd65318392cab0f0dcca8769706555ce6177d9e26419c9e6972540b2e202e209 +size 217209947 diff --git a/rwkv-132.pth b/rwkv-132.pth new file mode 100644 index 0000000000000000000000000000000000000000..40152cf532c4eb4a3b0e9a97a90a47962102138c --- /dev/null +++ b/rwkv-132.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf994a8206971ebd39773346f9febba6f0549a7772e904d654e92dffe76b85b5 +size 217209947 diff --git a/rwkv-137.pth b/rwkv-137.pth new file mode 100644 index 0000000000000000000000000000000000000000..2af9f02fef9e8933edf61ab75f6331388e426d8f --- /dev/null +++ b/rwkv-137.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bebc42b4ee3a015bbb110f11cb5a02e6602af9b998043c7b07aa1a50103848e +size 217209947 diff --git a/rwkv-139.pth b/rwkv-139.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d822527f2558af243344dd24f431210ce643d05 --- /dev/null +++ b/rwkv-139.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdcc3c243d5e40128ed3657eb6e9493d55a09cfa359dccdf809195b0da9cc4dd +size 217209947 diff --git a/rwkv-140.pth b/rwkv-140.pth new file mode 100644 index 0000000000000000000000000000000000000000..35b14f75868fd0dc553f72b60e54ce92aac11653 --- /dev/null +++ b/rwkv-140.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7c72c43c1da5e86baf7e9bfbc80341facbc83db0e70e1030e187a13e2603462 +size 217209947 diff --git a/rwkv-143.pth b/rwkv-143.pth new file mode 100644 index 0000000000000000000000000000000000000000..20268f68cd80134e397bdf174ee3ebd5dbe4ce6e --- /dev/null +++ b/rwkv-143.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f386abeaf3dbc3ae9819a95fb0401ffddfe53540045e063cea4de844982dc013 +size 217209947 diff --git a/rwkv-144.pth b/rwkv-144.pth new file mode 100644 index 0000000000000000000000000000000000000000..8bf747895843c4a80e9855a79c368bf3b773a42f --- /dev/null +++ b/rwkv-144.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6193e7ffa5f13f2483cad970077656a3fd4663890176b174f52b2a10336247df +size 217209947 diff --git a/rwkv-148.pth b/rwkv-148.pth new file mode 100644 index 0000000000000000000000000000000000000000..f324ba1b8f480c1cf7878776d77279ea68c1235b --- /dev/null +++ b/rwkv-148.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14af14a1f338f2a368c33a848c8242ed5539050b08697530c9a60545c625fb1f +size 217209947 diff --git a/rwkv-149.pth b/rwkv-149.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb3c0ed9dfda70d655f9a64e59b8192e63d61b82 --- /dev/null +++ b/rwkv-149.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6e90b63c380ad272f5d9e798c17dea934aab6d15f7005a0169f7da9ed09c29d +size 217209947 diff --git a/rwkv-152.pth b/rwkv-152.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd63e0fab88fff6533480530b19762a17dd6e293 --- /dev/null +++ b/rwkv-152.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb6ba2f0bf6e9c56b00a299142a524a42f30bad0f738f945338a9c83a43048a4 +size 217209947 diff --git a/rwkv-166.pth b/rwkv-166.pth new file mode 100644 index 0000000000000000000000000000000000000000..537c782d095c89394bd4ad598ad32b4c449508eb --- /dev/null +++ b/rwkv-166.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ecd045af0c234178ee9e1f0ff4984eac6ee291cd98b83375819cbf728d3a80d +size 217209947 diff --git a/rwkv-167.pth b/rwkv-167.pth new file mode 100644 index 0000000000000000000000000000000000000000..e243a82b01886ecde267ae70826999fc9dcfd567 --- /dev/null +++ b/rwkv-167.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:258a980ba85ef0e4356c07de687d4e8de7c555d082a955a89c3239b880e2f5ae +size 217209947 diff --git a/rwkv-169.pth b/rwkv-169.pth new file mode 100644 index 0000000000000000000000000000000000000000..761ef24081a8d9795897a42186e77429211da31c --- /dev/null +++ b/rwkv-169.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dcf6c35aa84c7517a73e1431fb20e8068ac89a2bffc3111be88b562e0584322 +size 217209947 diff --git a/rwkv-17.pth b/rwkv-17.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef64560707c8906e933859bf1186bc3919febfd6 --- /dev/null +++ b/rwkv-17.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fffef80e77ffa95d5d04430b0b4dc8b01b589e78ac882339a21aa0c3e89eed7 +size 217209812 diff --git a/rwkv-172.pth b/rwkv-172.pth new file mode 100644 index 0000000000000000000000000000000000000000..f71717f13d6a857c2e7677ca735b2067f38dfc77 --- /dev/null +++ b/rwkv-172.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d98267b659a79cddf9e21552f5259b1c0c17e64b04158af8e5ad1b458f1ddfbe +size 217209947 diff --git a/rwkv-18.pth b/rwkv-18.pth new file mode 100644 index 0000000000000000000000000000000000000000..2573e313c00cb24023fb1203252a89ea36bacd84 --- /dev/null +++ b/rwkv-18.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddc5844ebefd7f2de67402d4254fd26eb703d7b49727b72bb8ab471bf443ec3e +size 217209812 diff --git a/rwkv-190.pth b/rwkv-190.pth new file mode 100644 index 0000000000000000000000000000000000000000..0daafab208ca608410d484545df16e1b80ffed4e --- /dev/null +++ b/rwkv-190.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48e1892673b0d141c434307ee75e91aa3ac03edc603996382230e237f6c8a86f +size 217209947 diff --git a/rwkv-191.pth b/rwkv-191.pth new file mode 100644 index 0000000000000000000000000000000000000000..d7889aacbc19e9479a67b1a5db270846c6abcd7a --- /dev/null +++ b/rwkv-191.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1afb80802cece6e06d8dc1fa109f02198bb1b62f3d8433ed2342dab11c444fb +size 217209947 diff --git a/rwkv-193.pth b/rwkv-193.pth new file mode 100644 index 0000000000000000000000000000000000000000..fb16878dedf620ff97c08719573bdb44f355b47d --- /dev/null +++ b/rwkv-193.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a2253494b543a0ec14e75019934372a7136ab37eb47f85d9ee60c142b88776c +size 217209947 diff --git a/rwkv-195.pth b/rwkv-195.pth new file mode 100644 index 0000000000000000000000000000000000000000..55bc6670d1bcc385fb11f579dc0ad167f13fbcf4 --- /dev/null +++ b/rwkv-195.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b80f12dfbaf9ae88dc3ad3bb279709f61844e41a729d44e0bc872ad4aecec9a +size 217209947 diff --git a/rwkv-197.pth b/rwkv-197.pth new file mode 100644 index 0000000000000000000000000000000000000000..f6c8411bbce2fae1204f9633c2aa2116a2987d36 --- /dev/null +++ b/rwkv-197.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:285f0b486eaedb9fc43074e3b4c8f985bde2119892df8b5a3909ea72bd0f6d8b +size 217209947 diff --git a/rwkv-20.pth b/rwkv-20.pth new file mode 100644 index 0000000000000000000000000000000000000000..42af33278c9f2efcfa13be4127af22f32ba233ca --- /dev/null +++ b/rwkv-20.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:876ceba049b198b3d37e21140822b2a04b6fe9195907e7dcffbbad78d9a295b4 +size 217209812 diff --git a/rwkv-202.pth b/rwkv-202.pth new file mode 100644 index 0000000000000000000000000000000000000000..008f485915015f371f483c83b007de57cfacc72a --- /dev/null +++ b/rwkv-202.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f6f71e65bc96f38138b7a44a5ddbcd23a6af6ef0e0248ad524a74d079d27ffa +size 217209947 diff --git a/rwkv-203.pth b/rwkv-203.pth new file mode 100644 index 0000000000000000000000000000000000000000..1937622e4f98cbbc4dfcf28bcad3d1107c4cf512 --- /dev/null +++ b/rwkv-203.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10c4552123f1cad16b7c53c04b7d909fdff96d961c0f6887158ae49ac0c3f5ae +size 217209947 diff --git a/rwkv-207.pth b/rwkv-207.pth new file mode 100644 index 0000000000000000000000000000000000000000..db69d510aa4facfa9f48a831a15df598a84ce7a8 --- /dev/null +++ b/rwkv-207.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5794efe6885f825e8b0664ce086df138b644d2ab80335927b0901f995f96ea58 +size 217209947 diff --git a/rwkv-214.pth b/rwkv-214.pth new file mode 100644 index 0000000000000000000000000000000000000000..3fede15f4b752899e450969d57fa12f7fc5dea33 --- /dev/null +++ b/rwkv-214.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a17e429338a4137c89bcb4946acf322d294e335386d7245724b79d0a6bb17b0 +size 217209947 diff --git a/rwkv-215.pth b/rwkv-215.pth new file mode 100644 index 0000000000000000000000000000000000000000..e768996544774c4a28e30066a1b1ca291551841a --- /dev/null +++ b/rwkv-215.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f70ad9891a533a6e1819de1b99b7f69bff54814c698ad27ed0246ff4429ceae +size 217209947 diff --git a/rwkv-223.pth b/rwkv-223.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d265ae1077d4ab42f51eaebfe1f4168dc7eabc3 --- /dev/null +++ b/rwkv-223.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d51abd4506fca0ed23ac0045bf4416fe45cc2d4c049a4cf41fedda5ffc55d8 +size 217209947 diff --git a/rwkv-224.pth b/rwkv-224.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f5ae37fe56d2e428fad5d5bd2dc69ce1cd4ab07 --- /dev/null +++ b/rwkv-224.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ba7b58932df54bcde70779f7278b467a3d308a1df3a948e3fae8d174b8ab66 +size 217209947 diff --git a/rwkv-227.pth b/rwkv-227.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd8645975e7d800544cc4321b2a959052e398ed2 --- /dev/null +++ b/rwkv-227.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b0e52af3498bcea805a871659e2a1e5bae9c41bdba666247944d4f3b979db8 +size 217209947 diff --git a/rwkv-23.pth b/rwkv-23.pth new file mode 100644 index 0000000000000000000000000000000000000000..9380c57781dcde3357bbde51401f358115702151 --- /dev/null +++ b/rwkv-23.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:177e968f86d3c82fbda9e83c5316808574c9b13e002f84e2590ebdbff71c647d +size 217209812 diff --git a/rwkv-233.pth b/rwkv-233.pth new file mode 100644 index 0000000000000000000000000000000000000000..e082696a0cfe3b91cb0b87867e16eaf92650919b --- /dev/null +++ b/rwkv-233.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caf101a0ba18f15a39b929b9bca0c3c2e5ee058c1917ee9f3b314d8b1ff6752f +size 217209947 diff --git a/rwkv-237.pth b/rwkv-237.pth new file mode 100644 index 0000000000000000000000000000000000000000..eac3c2ae191191620dfb8ff009e2e65d37f33995 --- /dev/null +++ b/rwkv-237.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774ba755e5ad5d8ac012472882a3c7117f3e62816df8e039ff85c747a7693a1a +size 217209947 diff --git a/rwkv-239.pth b/rwkv-239.pth new file mode 100644 index 0000000000000000000000000000000000000000..d75a6083a8bc965ddf3f7a313add86981db42cef --- /dev/null +++ b/rwkv-239.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad44816feceac5e22f64cd7f5634e32e87f153c0a912f2d5f11a83049ec5801 +size 217209947 diff --git a/rwkv-243.pth b/rwkv-243.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3975d9df30eefeda676c990024cd915913b810a --- /dev/null +++ b/rwkv-243.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa2df317442d2b016eb755c8e2ebc0f50db47a6ee71d35cb380357ba25f54cf +size 217209947 diff --git a/rwkv-245.pth b/rwkv-245.pth new file mode 100644 index 0000000000000000000000000000000000000000..fa18992fe6bb8fc9fc1ad9489020d9e1e5198b92 --- /dev/null +++ b/rwkv-245.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8e63f6925956318ea7580fe03a4e4ef94ef890bb1f821632eafd5084ef72633 +size 217209947 diff --git a/rwkv-246.pth b/rwkv-246.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a4ed22ead2a989ee40eba981f92662cb759b9c5 --- /dev/null +++ b/rwkv-246.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3abb0c85d684119a0ffb9a596d8e5798b4a937c8fbca4830b6f3fe77b1fef0ec +size 217209947 diff --git a/rwkv-247.pth b/rwkv-247.pth new file mode 100644 index 0000000000000000000000000000000000000000..df4415941e6c027377dc2f8bb67b00fd5a0434c5 --- /dev/null +++ b/rwkv-247.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:507937ffe15b0746b8a2bb3e6af0703cd4333b8177d9b78fc2ccfc5db7accd5b +size 217209947 diff --git a/rwkv-251.pth b/rwkv-251.pth new file mode 100644 index 0000000000000000000000000000000000000000..7987deee27bf5cd96993fe97c5269680315f6a3d --- /dev/null +++ b/rwkv-251.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a7e20f4ae1c3d0127ffe99ad87e9aa634057639a73b52d6ffd75d69c3b4ce1d +size 217209947 diff --git a/rwkv-254.pth b/rwkv-254.pth new file mode 100644 index 0000000000000000000000000000000000000000..1106fbbc5e8c6a6226a05bbabb0773f4d854a413 --- /dev/null +++ b/rwkv-254.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da8f112056822d2b5f5c2f74aeaa321b01db2443613dc468319d39535c53e20 +size 217209947 diff --git a/rwkv-256.pth b/rwkv-256.pth new file mode 100644 index 0000000000000000000000000000000000000000..e206009e4139661ddd90cd3a3d246eb3cef02844 --- /dev/null +++ b/rwkv-256.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dac24e2be43caf1d7888aa4139ba65ce868386c0192ba954446764d8714f1f4 +size 217209947 diff --git a/rwkv-258.pth b/rwkv-258.pth new file mode 100644 index 0000000000000000000000000000000000000000..f24fc53019be39cc0e0f235ce1534a3916d38207 --- /dev/null +++ b/rwkv-258.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b223f9cce78a6d75919bd214ea25d4c2267be84ae7da0c90114bba4f74d959 +size 217209947 diff --git a/rwkv-260.pth b/rwkv-260.pth new file mode 100644 index 0000000000000000000000000000000000000000..af9d60f633b1dda1a1839cef183bbc513df5f226 --- /dev/null +++ b/rwkv-260.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a613df2f79fdae669fff67a6edda1b866750727035049430180c6970710f97a +size 217209947 diff --git a/rwkv-262.pth b/rwkv-262.pth new file mode 100644 index 0000000000000000000000000000000000000000..b657d01932c3b6e163ae02397250d2d8ac28441d --- /dev/null +++ b/rwkv-262.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c02fd87d3d5c1ccc3df5d858a66975628207bbc4191aa5e6906d276e7698ebeb +size 217209947 diff --git a/rwkv-269.pth b/rwkv-269.pth new file mode 100644 index 0000000000000000000000000000000000000000..bc1fd273f05600b2e3f28c66918009f703173fdc --- /dev/null +++ b/rwkv-269.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfea675f26e9b1c762704080775fec25a5076f7b135a280c9ba9c2ea870c2215 +size 217209947 diff --git a/rwkv-276.pth b/rwkv-276.pth new file mode 100644 index 0000000000000000000000000000000000000000..16f710c65f083226258d2103c23a70e40c8bbff1 --- /dev/null +++ b/rwkv-276.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9713efa8143902d38c3e968b2444de23d26f911e924f558325d3977bc8954f8 +size 217209947 diff --git a/rwkv-279.pth b/rwkv-279.pth new file mode 100644 index 0000000000000000000000000000000000000000..7376c096aaa34bead5e1f6987b4e89664ec7c225 --- /dev/null +++ b/rwkv-279.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c99b82f017228f5c5ccb0e26e713003584dc813e47be98fc31cf2625ad3b4fd +size 217209947 diff --git a/rwkv-285.pth b/rwkv-285.pth new file mode 100644 index 0000000000000000000000000000000000000000..b33a48847e9cfea54efce2221a7d38de988c97a7 --- /dev/null +++ b/rwkv-285.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:976ac41559a7f537219c30db0e7a119b22bfadc88754eb6fa66a1fae63cab284 +size 217209947 diff --git a/rwkv-291.pth b/rwkv-291.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ccd0d60edb701a040fcc1486a8d11da1a0fb3ed --- /dev/null +++ b/rwkv-291.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c9b350810bcc475e4f73ac4feec26b17c18c98f45f744452250b2e9ca2a871 +size 217209947 diff --git a/rwkv-292.pth b/rwkv-292.pth new file mode 100644 index 0000000000000000000000000000000000000000..2af330a286424797da427bf00421c5591c79bb63 --- /dev/null +++ b/rwkv-292.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82868466e7a1f758d413327006629364d8e8e8f4428292677d76634083ba1686 +size 217209947 diff --git a/rwkv-293.pth b/rwkv-293.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f7ef3d1b6c27ce7184aa42804023235056c49e4 --- /dev/null +++ b/rwkv-293.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa9193fb06f43a7b98c4637a72c84f8423f1b3dcd0e69a833c0ed2e363bd458e +size 217209947 diff --git a/rwkv-294.pth b/rwkv-294.pth new file mode 100644 index 0000000000000000000000000000000000000000..d11ed1eef2a55d5d433551524c49763366458960 --- /dev/null +++ b/rwkv-294.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a78c76f186a1a4050d421b40039d2e02e61dc5f2c784097608d0f76500c2e3c9 +size 217209947 diff --git a/rwkv-295.pth b/rwkv-295.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9b1840572ad49ec57f3a5a858ccb133ec76bae1 --- /dev/null +++ b/rwkv-295.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:593b31ee45585144c9bb8d08c2b8d5574b689459ec9f6b171df66d13567fab1b +size 217209947 diff --git a/rwkv-300.pth b/rwkv-300.pth new file mode 100644 index 0000000000000000000000000000000000000000..0233a4ce1f5870342e33717fba18fb5cc3284f35 --- /dev/null +++ b/rwkv-300.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e51d03e230a95ad9971279dbcbeb0d72c8bc1d0cc41c7d200516b4719f9d8c9 +size 217209947 diff --git a/rwkv-301.pth b/rwkv-301.pth new file mode 100644 index 0000000000000000000000000000000000000000..fafbe646b480ce41f71e0612bf04c97859a06901 --- /dev/null +++ b/rwkv-301.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0c4cd59b7226b692809e31acb4d2786288c30d81f1cf4ad8d5a0adcdc05952 +size 217209947 diff --git a/rwkv-305.pth b/rwkv-305.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfedf3e3a91a3ce20761d8fe34612caaca67a0f8 --- /dev/null +++ b/rwkv-305.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de6e8b80b7f92422f94aee32bdcb37918d05960eea27808fd0a8d9d91a876c68 +size 217209947 diff --git a/rwkv-312.pth b/rwkv-312.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ce3a94625abc1d646ad070fb054ecdc096f9d67 --- /dev/null +++ b/rwkv-312.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3200eced34c601cb64f1cdd3456681c8858eddf67f8cb2d5f9a16f9fa21b131 +size 217209947 diff --git a/rwkv-314.pth b/rwkv-314.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a3e4b6a35715ab1e724c8557e7096dda40b601b --- /dev/null +++ b/rwkv-314.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d663055ed5a665b93561709bd5a44130f9d9643a0e464bf87446b291fb5b85a4 +size 217209947 diff --git a/rwkv-316.pth b/rwkv-316.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ad96ecd1da526fe14a7f3f436c6f17075c48a29 --- /dev/null +++ b/rwkv-316.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13934a5313e6327aa793b3d71f66cfd70dcef92f8b91f94a03fa4d4b89737c76 +size 217209947 diff --git a/rwkv-32.pth b/rwkv-32.pth new file mode 100644 index 0000000000000000000000000000000000000000..deb18512f0b98add434f9dd5b5ffd2c2dc1f739f --- /dev/null +++ b/rwkv-32.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0791a33f2cd01efcb624b951445fbf21fed5e911ca6230aa8bfb4aa2565f81 +size 217209812 diff --git a/rwkv-323.pth b/rwkv-323.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b6b140d8237c5fb56d28b12280a98f3f466ae29 --- /dev/null +++ b/rwkv-323.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbffe7d332f0bdb55d1dea50c67e61e35e14080b4444dc7d920a495f960ae2a3 +size 217209947 diff --git a/rwkv-326.pth b/rwkv-326.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcd1acec7f05b0dc36e739869d2a2a687fa34777 --- /dev/null +++ b/rwkv-326.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1335423f71456908d073f98cc6d7ac15a9e566944cc804ccbdc34f041a9582 +size 217209947 diff --git a/rwkv-327.pth b/rwkv-327.pth new file mode 100644 index 0000000000000000000000000000000000000000..acede8f68cff9c5b9bb81d9fd61dfd2452634eb6 --- /dev/null +++ b/rwkv-327.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c99ac2243c32bd55a29b4cbe8aac72c99df365c525e8b286441060d46e6336e +size 217209947 diff --git a/rwkv-332.pth b/rwkv-332.pth new file mode 100644 index 0000000000000000000000000000000000000000..388cd8e8a0922a5688eb129a33a0038024dfa5d4 --- /dev/null +++ b/rwkv-332.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:388513847c9c31dc53c1254e1a70c6fc20a894ab9cc3f637ad22615dd21e3c78 +size 217209947 diff --git a/rwkv-340.pth b/rwkv-340.pth new file mode 100644 index 0000000000000000000000000000000000000000..44408aad23771955e0b6071df192880523f284c0 --- /dev/null +++ b/rwkv-340.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc60bbdc407f917aeb1f574987757af1d8446666d88093d9acdb1891fca8c9f +size 217209947 diff --git a/rwkv-347.pth b/rwkv-347.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3f7a90ae23a5f22177d40d23012e432a7a136b1 --- /dev/null +++ b/rwkv-347.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5bcd237ce308dc120ba984bc8b4b81572531fc240cc8a528679c3246b798294 +size 217209947 diff --git a/rwkv-350.pth b/rwkv-350.pth new file mode 100644 index 0000000000000000000000000000000000000000..3e141bb4726b531e09bffa8c407776452cc419a8 --- /dev/null +++ b/rwkv-350.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3fc668e430ce305f302b8aec925a3704145e4670f086399b0b45dfc5696d525 +size 217209947 diff --git a/rwkv-353.pth b/rwkv-353.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f54e7d7b4c0a5272a7004d72bfe6e65b14f3fdc --- /dev/null +++ b/rwkv-353.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a59b7bf8e31370a716a0b71d5c902d4b1d777d724569135a0498bbbe369a773f +size 217209947 diff --git a/rwkv-362.pth b/rwkv-362.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3a66eafeff1e073994cfe90676e273f4e3f5755 --- /dev/null +++ b/rwkv-362.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e2ff3f468ece5ca2a08a676f2afa0abc8b21d4ef5515a40f03222a077d42b96 +size 217209947 diff --git a/rwkv-368.pth b/rwkv-368.pth new file mode 100644 index 0000000000000000000000000000000000000000..d247765b340538210391f6a9ea42ace9e8b374a4 --- /dev/null +++ b/rwkv-368.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e9bd9333466e751dfa482cce542233ea335e58cb64bdd5a7f7fe88db133bb35 +size 217209947 diff --git a/rwkv-37.pth b/rwkv-37.pth new file mode 100644 index 0000000000000000000000000000000000000000..89b43bdba195e0f49044593e92d20c515ff35999 --- /dev/null +++ b/rwkv-37.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c6ef4ce0d3cde69e789377ee95be7e637fc3e2a6539f8bf9be5c27c37126640 +size 217209812 diff --git a/rwkv-38.pth b/rwkv-38.pth new file mode 100644 index 0000000000000000000000000000000000000000..c6f01113f535283d9c052310594aabfe08d04312 --- /dev/null +++ b/rwkv-38.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966fa911af5b90efd23016234c46062392630957cd37bdcb441e461a10c8f778 +size 217209812 diff --git a/rwkv-380.pth b/rwkv-380.pth new file mode 100644 index 0000000000000000000000000000000000000000..295b0566d6aee33e04298778d9fbc5117ddf3413 --- /dev/null +++ b/rwkv-380.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd3eaa24d2943a79cafe4eb83ec154dad2d5dd85d060caa87e79c35541921cb4 +size 217209947 diff --git a/rwkv-381.pth b/rwkv-381.pth new file mode 100644 index 0000000000000000000000000000000000000000..a88a8c6354be54519983c7480df391d88e20b8c1 --- /dev/null +++ b/rwkv-381.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be5e276ab9000b73366df921dddfe740af64952b8b702600d2fa7d64760b32f +size 217209947 diff --git a/rwkv-383.pth b/rwkv-383.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae0f7ca61c68e67f4a996f5339478857344649cc --- /dev/null +++ b/rwkv-383.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d4d2ba3e3b17f27ff73646249d55b809d7d1dab44055501fbbee4e83e0babb3 +size 217209947 diff --git a/rwkv-387.pth b/rwkv-387.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5915d84bb2bae549e0627c27bfd7d4912e6b46b --- /dev/null +++ b/rwkv-387.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:500553b735e8e3d6b6a7c903e2cb9232df0556332eea370dbb4a6c7a2109dda4 +size 217209947 diff --git a/rwkv-389.pth b/rwkv-389.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9662fc0b029b53cff4506f4b9b87e50e75df230 --- /dev/null +++ b/rwkv-389.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:087aff3dfb83251423ce13492c6794e893cf2ffe4b01560224755b161f532d21 +size 217209947 diff --git a/rwkv-391.pth b/rwkv-391.pth new file mode 100644 index 0000000000000000000000000000000000000000..ebe6def028bdc5d2d2ae5c72c518011c209fd164 --- /dev/null +++ b/rwkv-391.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13a5ad196c8eba9253bd8f9948e226f945356b94f00cba82202ea1a765a0234f +size 217209947 diff --git a/rwkv-395.pth b/rwkv-395.pth new file mode 100644 index 0000000000000000000000000000000000000000..a419f555d7b8530524abf9c1a4013312c6219178 --- /dev/null +++ b/rwkv-395.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05934fbbf09c411dadd4adea5162d4b01a6f726a931d145fea19068c0c0c16ac +size 217209947 diff --git a/rwkv-399.pth b/rwkv-399.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5a974ee19d1a1edf2eb5a360a39a18d0573e598 --- /dev/null +++ b/rwkv-399.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aded944e739ee65af2fa4517babd97f6ccf7b44ceff77cc6f89106ba84432cb +size 217209947 diff --git a/rwkv-40.pth b/rwkv-40.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8d9e08e518342e26717038d63906c11f120c28a --- /dev/null +++ b/rwkv-40.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66514b6e9209d3c0a84893af03c96965f7c87385a92a29a4f70645196ed72d26 +size 217209812 diff --git a/rwkv-400.pth b/rwkv-400.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a391bf9a3077c69f4c7305a9c55586b2b2f2a32 --- /dev/null +++ b/rwkv-400.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1ebb34183e765fe6ab2a6b50f7fba7edba837d6c47c59077497ba7caf543e9 +size 217209947 diff --git a/rwkv-402.pth b/rwkv-402.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a82b320239d3cc4b1b8ecfd2712bb925a68f319 --- /dev/null +++ b/rwkv-402.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6794a3e5f8df77346196dea13fbc342f3b9a0a392d7358be57ddb35a12826d01 +size 217209947 diff --git a/rwkv-404.pth b/rwkv-404.pth new file mode 100644 index 0000000000000000000000000000000000000000..30411789fe466da2aab4f6b39b0d2e2389a26945 --- /dev/null +++ b/rwkv-404.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba8cee53b9e1c0f0a08d6e5f1600c35ea45c8a87769f1c076410ffe26b6909a9 +size 217209947 diff --git a/rwkv-407.pth b/rwkv-407.pth new file mode 100644 index 0000000000000000000000000000000000000000..40e32012be7510d66417ab678c8ceca06dc86ed1 --- /dev/null +++ b/rwkv-407.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de840085eafb48131ebd5647351179c4eb039b0c86714c3a34db164a0972f6c6 +size 217209947 diff --git a/rwkv-410.pth b/rwkv-410.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd4a43d46ff71d343be8837fd26814eff3447a3d --- /dev/null +++ b/rwkv-410.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499d471040f9f6173d0183ccc8bcb45e5c60822504f89421230b5a8afb738cfe +size 217209947 diff --git a/rwkv-412.pth b/rwkv-412.pth new file mode 100644 index 0000000000000000000000000000000000000000..e614df97ea757b40e69925bf204ee6bc85c50a8c --- /dev/null +++ b/rwkv-412.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8408d96806443048302333cb1fbb2c9af25e304a7c75e9380c8e768489b4aff +size 217209947 diff --git a/rwkv-414.pth b/rwkv-414.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f353ecd6f0b194eef5b8e016440b809a753eadf --- /dev/null +++ b/rwkv-414.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d58d8124b186e7ab43610eaabc8de1bd1d66d5ec68f6cce251eef45614693178 +size 217209947 diff --git a/rwkv-418.pth b/rwkv-418.pth new file mode 100644 index 0000000000000000000000000000000000000000..36fa59829128aa07159292bd122103e01836c55f --- /dev/null +++ b/rwkv-418.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c156aa41933cc87a98677019942e992875020cfd1b808e3daf3912ebe39b9f7 +size 217209947 diff --git a/rwkv-424.pth b/rwkv-424.pth new file mode 100644 index 0000000000000000000000000000000000000000..76f2c256a08d5d23ff8e83f73b623f134397a96b --- /dev/null +++ b/rwkv-424.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a525a9718e5be24ea13099cf659f90ab5170760843b45bdbf0c60d32ab46998 +size 217209947 diff --git a/rwkv-426.pth b/rwkv-426.pth new file mode 100644 index 0000000000000000000000000000000000000000..40c72f1b6f2f4aaf02d6326f54ecb64f87266b0f --- /dev/null +++ b/rwkv-426.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f42f211db4feb899e19c92ab432ab363514cbc6fdb9ddcd6fc8fecde3885099a +size 217209947 diff --git a/rwkv-433.pth b/rwkv-433.pth new file mode 100644 index 0000000000000000000000000000000000000000..969b0297323a8f1f3ddf03bc3b141f00e1c04672 --- /dev/null +++ b/rwkv-433.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82c740003c24e225959144d5331223950a70c287caeab8696058253d5196812 +size 217209947 diff --git a/rwkv-438.pth b/rwkv-438.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b0c5c3b630a959fc9adf70cde7e4f273687918f --- /dev/null +++ b/rwkv-438.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1f234c77e73b9459520b7e4de2428ec8d02bee1fbbb24dac5304f8f9883ad6c +size 217209947 diff --git a/rwkv-442.pth b/rwkv-442.pth new file mode 100644 index 0000000000000000000000000000000000000000..60de8b4be4b5795e63655a3197c158412448dc52 --- /dev/null +++ b/rwkv-442.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a7ddcea46f8b457e929cd2c7fe500bc68aee27120dcb9bc0f5f2853d431deca +size 217209947 diff --git a/rwkv-445.pth b/rwkv-445.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3ab9141de986c5a3e852bb56cbdd924da138f9d --- /dev/null +++ b/rwkv-445.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:738eee5e7f7e9f5f9db67dbcd05bd4f1b7fca5a0e750853dc6189252142ea723 +size 217209947 diff --git a/rwkv-447.pth b/rwkv-447.pth new file mode 100644 index 0000000000000000000000000000000000000000..977161c02b4033a054e6ccba52e852d77f656b19 --- /dev/null +++ b/rwkv-447.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4612806b01e5a788ae3ad28ec85f8995e7fc5d59db3c72bcc34712f5d7130470 +size 217209947 diff --git a/rwkv-454.pth b/rwkv-454.pth new file mode 100644 index 0000000000000000000000000000000000000000..1c1a3070b27a1c8ac2359ce70a6fb69dccb0ea59 --- /dev/null +++ b/rwkv-454.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26bd9609c641393a6e62837bd5cd934173cbf9752ec784f333d00e46727fc114 +size 217209947 diff --git a/rwkv-455.pth b/rwkv-455.pth new file mode 100644 index 0000000000000000000000000000000000000000..e87f80fa7a2bbd95836de0577a06a3a6824c6571 --- /dev/null +++ b/rwkv-455.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c2b487bdead88ad1357703924350bb6aaec8b206c3420671da18ec29e57275 +size 217209947 diff --git a/rwkv-458.pth b/rwkv-458.pth new file mode 100644 index 0000000000000000000000000000000000000000..c5b9ada8118886ee6052635eb732ca47151d6089 --- /dev/null +++ b/rwkv-458.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d20773907a9e5066138f8dbdad68c02ecb02c5e4e689a51f116caa9d32216ac8 +size 217209947 diff --git a/rwkv-467.pth b/rwkv-467.pth new file mode 100644 index 0000000000000000000000000000000000000000..e918f00b1be820536172cbb330943cc4993d6887 --- /dev/null +++ b/rwkv-467.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56d8b2490d23f2cc379ad0eb279f8148885b394670333a44fa2c1dd88aaae959 +size 217209947 diff --git a/rwkv-474.pth b/rwkv-474.pth new file mode 100644 index 0000000000000000000000000000000000000000..da4cb08bffdca4db7e6e1e79e8a3cbb45bb812f7 --- /dev/null +++ b/rwkv-474.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:695b9de7b08f276c7b186146f888f8c0e7f82423db0964cceae0625a91e29de4 +size 217209947 diff --git a/rwkv-475.pth b/rwkv-475.pth new file mode 100644 index 0000000000000000000000000000000000000000..88a07fc241fcf8f1d0f5c3fd7b947c1d47fe7484 --- /dev/null +++ b/rwkv-475.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74cace825cf63a97359fe7614fbde298e17135573725cf23edfcdb63d73ac5a3 +size 217209947 diff --git a/rwkv-477.pth b/rwkv-477.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd57a5d392ac4b17824aa4fd6beec8d83d1ea6a6 --- /dev/null +++ b/rwkv-477.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1e5955fe5a6c26cf8b9d98e28a57565cd0c86741cd59313749ef7d2d1087404 +size 217209947 diff --git a/rwkv-488.pth b/rwkv-488.pth new file mode 100644 index 0000000000000000000000000000000000000000..575139e853ab8df25f6851124d5616433fa11fc0 --- /dev/null +++ b/rwkv-488.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cc3e849a1f8fe205bd2bf752e3b7676171d0d2a49be598409397aae117acd45 +size 217209947 diff --git a/rwkv-495.pth b/rwkv-495.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f50a58b754f0b3c96a36be685dd6ecc41903f24 --- /dev/null +++ b/rwkv-495.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a0a87ca6c5c9938ed08972a92f4e37c1bbaf4b17432ebe9107e1fed5d206262 +size 217209947 diff --git a/rwkv-503.pth b/rwkv-503.pth new file mode 100644 index 0000000000000000000000000000000000000000..18a04e10e1c9eaaf4c93b03ca746b7ffd41adcbe --- /dev/null +++ b/rwkv-503.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed186c75d35e4cb29db1a3e4631402c4b2e6da6d57466019688d7e11a5dbb0be +size 217209947 diff --git a/rwkv-509.pth b/rwkv-509.pth new file mode 100644 index 0000000000000000000000000000000000000000..d1be7dfb53415381ffd4320e1d3091de2ab08cd1 --- /dev/null +++ b/rwkv-509.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f5d9c9a7ed9644b5efd913c4a7499f98fe8cdc0939c57a8b8525be12ca60ec +size 217209947 diff --git a/rwkv-510.pth b/rwkv-510.pth new file mode 100644 index 0000000000000000000000000000000000000000..e2a55b01972ff186912f0e948aab4903ec7572e2 --- /dev/null +++ b/rwkv-510.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8846fdbef5b20e3e65098ad2dda6249aedea787d7aba875082573d45f156d26f +size 217209947 diff --git a/rwkv-511.pth b/rwkv-511.pth new file mode 100644 index 0000000000000000000000000000000000000000..46a81c12874bb92eaea33e822c35262cd7d2023a --- /dev/null +++ b/rwkv-511.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ef4f621cd33a4965a705257d627378ea26e8b751be39a70a8557b5f6fb418f +size 217209947 diff --git a/rwkv-526.pth b/rwkv-526.pth new file mode 100644 index 0000000000000000000000000000000000000000..57fbb7fc04ac6a73031b413763711dc5d48d5bc6 --- /dev/null +++ b/rwkv-526.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76dc8ea52be7f825de762cfbb58f3935b57238930aef8cda62a095c7f8f0fa84 +size 217209947 diff --git a/rwkv-528.pth b/rwkv-528.pth new file mode 100644 index 0000000000000000000000000000000000000000..66e2bbdf05e057f77d8586476681f00245c712ac --- /dev/null +++ b/rwkv-528.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5014093b62eb441519369f12ea1a92eccaa20d844c6507df195c8d2938b58d71 +size 217209947 diff --git a/rwkv-529.pth b/rwkv-529.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6eb58811a3d6c1f3052c7dbe209aa7290842ad2 --- /dev/null +++ b/rwkv-529.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb72f1cc0365427c3dcc3396a1ae341235e7b5861877b055de360aec98c0d4a3 +size 217209947 diff --git a/rwkv-533.pth b/rwkv-533.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9bd6b509fcb20e857e743e387212533bf49bba7 --- /dev/null +++ b/rwkv-533.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c75c5cb34a125bc60a6a995a07d94bcabcc96435267feb45768cb09d7c2a952 +size 217209947 diff --git a/rwkv-537.pth b/rwkv-537.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9e3b5325cc24303da657090e35b7f97356d8dbb --- /dev/null +++ b/rwkv-537.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69197cb4333059713bc400ee098ccb9536c3e494018b2c9961af0805760ad41e +size 217209947 diff --git a/rwkv-541.pth b/rwkv-541.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4ddea6869fa84c7518d643a7b6a3a04f62fc57f --- /dev/null +++ b/rwkv-541.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0de95698204e0bf4e2713b979bf6b3b7ec2468f2756b23a815771b28fb5e3db3 +size 217209947 diff --git a/rwkv-544.pth b/rwkv-544.pth new file mode 100644 index 0000000000000000000000000000000000000000..b657f741ec0df0e925970afdc927caa4e1e4ed36 --- /dev/null +++ b/rwkv-544.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8697899784652d0be8979f22ca820b86609b28070892f99daec54341ca247ddb +size 217209947 diff --git a/rwkv-548.pth b/rwkv-548.pth new file mode 100644 index 0000000000000000000000000000000000000000..da0d79785fc146d1f4e32f936b0698d3b0a258d4 --- /dev/null +++ b/rwkv-548.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c75eec137f34c3457e8c362431fe2ad026192ad665d27a8f7625f629968e7933 +size 217209947 diff --git a/rwkv-549.pth b/rwkv-549.pth new file mode 100644 index 0000000000000000000000000000000000000000..eed6850181d5e5d1b7fd99fadf5f24dfff013b74 --- /dev/null +++ b/rwkv-549.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae82f8ce6f3e837a2744b29134d9065e97fbb60e522be6aa1ce2c4dc69be4e0 +size 217209947 diff --git a/rwkv-554.pth b/rwkv-554.pth new file mode 100644 index 0000000000000000000000000000000000000000..2da276a03f00de36da4757efdcf818be82beddc5 --- /dev/null +++ b/rwkv-554.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22defb039184e54965d63ab48c8deced3d01db2b106bd5c1e7c45ba7bfaf0003 +size 217209947 diff --git a/rwkv-560.pth b/rwkv-560.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac71a7854936ebf35cdac95bf5956a08297c0391 --- /dev/null +++ b/rwkv-560.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd76e4c7876ecbb016497265d3c1f6956db8f129c30ac1a77f39bcc010d55f09 +size 217209947 diff --git a/rwkv-563.pth b/rwkv-563.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a19a72bae237596b2c0c52b8c6572370e932dab --- /dev/null +++ b/rwkv-563.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:427e3e114a6045192956ad8c15de16c00051c9a77958a116a14b1f7ef5fbe271 +size 217209947 diff --git a/rwkv-567.pth b/rwkv-567.pth new file mode 100644 index 0000000000000000000000000000000000000000..463ce882919a90c68986d21ff9599de3bd5c5073 --- /dev/null +++ b/rwkv-567.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a40388ea7e967a1876d90d34edeb4b164851a292ae3b00b86cee126a520da6d +size 217209947 diff --git a/rwkv-570.pth b/rwkv-570.pth new file mode 100644 index 0000000000000000000000000000000000000000..b82fa24fcc095022a0ac75a107928a5cb33c2489 --- /dev/null +++ b/rwkv-570.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e52c4a1a640a5b749f2264c05f37f968b035325dfa0ea3caee0fb0f181b4cd6a +size 217209947 diff --git a/rwkv-576.pth b/rwkv-576.pth new file mode 100644 index 0000000000000000000000000000000000000000..c2310bc9cdfd55c54286b9ae7de9d68e9d21ce4e --- /dev/null +++ b/rwkv-576.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72ce193b21e13297b3f6fd4597a2a3fca681b947efaee4db98ffa5af9aa159f4 +size 217209947 diff --git a/rwkv-577.pth b/rwkv-577.pth new file mode 100644 index 0000000000000000000000000000000000000000..003178e4a2c758f63d33088e3b7ec77e87fb6db9 --- /dev/null +++ b/rwkv-577.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04208a4e55c15cb82a1b5a5a6e54ee8949e68cbe1eb43c5cd17b9e2f7f82f9e9 +size 217209947 diff --git a/rwkv-578.pth b/rwkv-578.pth new file mode 100644 index 0000000000000000000000000000000000000000..a5866152772842bd5ce8d1b3a2b13d51aa5b3902 --- /dev/null +++ b/rwkv-578.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c29207033ad848266e02969e3adc7a5942d63b667f6deebfec58d239c45d2ca1 +size 217209947 diff --git a/rwkv-58.pth b/rwkv-58.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e0f4d251b483274e516d47b93461d37a577afda --- /dev/null +++ b/rwkv-58.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92409dae4b49e7f71852590d224dc798237da1b9f3bd1b56140bb9ee195946f9 +size 217209812 diff --git a/rwkv-580.pth b/rwkv-580.pth new file mode 100644 index 0000000000000000000000000000000000000000..21383e4ba0dbe5e54aef59ac800a0653002d680c --- /dev/null +++ b/rwkv-580.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76e61cb2f794ff3d486808118caa259ce9a2114af02531f24d0855fc710bad36 +size 217209947 diff --git a/rwkv-582.pth b/rwkv-582.pth new file mode 100644 index 0000000000000000000000000000000000000000..6b59e5cb18638fbe59f691c93a68741dfab9907a --- /dev/null +++ b/rwkv-582.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:655b1526faabdcd01be891bf86b608c57840184dca1b762771aa6c9fe0e14e36 +size 217209947 diff --git a/rwkv-60.pth b/rwkv-60.pth new file mode 100644 index 0000000000000000000000000000000000000000..e58993f44188b6e7968b35eb941f6cfad2182bd7 --- /dev/null +++ b/rwkv-60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1c54751274fdb946a1d3a1a0507c6ed40ef6691f275271bbd91bda475088223 +size 217209812 diff --git a/rwkv-64.pth b/rwkv-64.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f780132c4c94fe07576f8dec90daeb205fb448f --- /dev/null +++ b/rwkv-64.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f63ead983fe8ce4967436017d7cd1e186c7c0c03cf6d9427e570653222e7a11 +size 217209812 diff --git a/rwkv-70.pth b/rwkv-70.pth new file mode 100644 index 0000000000000000000000000000000000000000..10b1b8390ca6175c24df5d24e693ce4e6e5e3a7b --- /dev/null +++ b/rwkv-70.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c420ff9903db3c839a67ebd51281befe86bb3ccfe7ee517ec352cf6f8b8c19d +size 217209812 diff --git a/rwkv-72.pth b/rwkv-72.pth new file mode 100644 index 0000000000000000000000000000000000000000..dc1f69a513891f945379324312c38ba87d2e4f6e --- /dev/null +++ b/rwkv-72.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d742e0ef120a05a56ce3f6a207c48d77957ba820189a4d03bbf1369a8e798fd +size 217209812 diff --git a/rwkv-73.pth b/rwkv-73.pth new file mode 100644 index 0000000000000000000000000000000000000000..762619519f8d734dd2a2744c5da80d0968eaa695 --- /dev/null +++ b/rwkv-73.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b2824c025cde9fe41d8a75684ce4928c97e061a8bc21e332ad09f39ceedf43f +size 217209812 diff --git a/rwkv-85.pth b/rwkv-85.pth new file mode 100644 index 0000000000000000000000000000000000000000..73a8681580777c64d04bc5f55d88f6d15e261a5b --- /dev/null +++ b/rwkv-85.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a797d47ab59225cd597a97a2d2b7c2634f7bc4dcd9c82549855f4bc07a14f158 +size 217209812 diff --git a/rwkv-86.pth b/rwkv-86.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f68eef7e57064e491e7bdb0decbae2b29af97f1 --- /dev/null +++ b/rwkv-86.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1bc734e27204b3878b49e5ca07855719b4d97f5930141a3979489bbc81ee89d +size 217209812 diff --git a/rwkv-88.pth b/rwkv-88.pth new file mode 100644 index 0000000000000000000000000000000000000000..eeaf5b0053c6343e8515c19d233b869342406962 --- /dev/null +++ b/rwkv-88.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c14584d29674e630d0e0415d084325721465f6b3a06b83c1cb4a5c42df5ca6 +size 217209812 diff --git a/rwkv-91.pth b/rwkv-91.pth new file mode 100644 index 0000000000000000000000000000000000000000..79bc92fe874b32853930618bef093c0ad1320cdd --- /dev/null +++ b/rwkv-91.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80e845d06cf95d8b5e4ec35b202e941669de0a8e9282d1b46d7438796efd0402 +size 217209812 diff --git a/rwkv-94.pth b/rwkv-94.pth new file mode 100644 index 0000000000000000000000000000000000000000..59c8d0f140778530aba7673b9c1ed8c95b0674f2 --- /dev/null +++ b/rwkv-94.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7be7b539b31417f83d2c42390c5f9f481a3d4a7c75bcaf8c6a5750db68560c6 +size 217209812 diff --git a/rwkv-95.pth b/rwkv-95.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae808cd61ae4c558681aa2e78d641c881b2e1ff8 --- /dev/null +++ b/rwkv-95.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce0c213a0d4db488fcb04c116109d5ef0980da8c8e7312d0b80402ce9e1f304 +size 217209812 diff --git a/rwkv-96.pth b/rwkv-96.pth new file mode 100644 index 0000000000000000000000000000000000000000..462bed919437887ad463a8d19d0f9e071bb25c80 --- /dev/null +++ b/rwkv-96.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47f8a3b75aebf3fb4339574964d46006007f4d8af785e46c29f9c1a55d52a149 +size 217209812 diff --git a/rwkv-98.pth b/rwkv-98.pth new file mode 100644 index 0000000000000000000000000000000000000000..95a6b869a28c283cca35b82c3561152d8cb6135b --- /dev/null +++ b/rwkv-98.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae14c61c4220be7838d8d51ee3f48f119ed512852f38476866161c98a6d31a2d +size 217209812 diff --git a/train_log.txt b/train_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..d57e4d60ad0a02ab48feb57eaee3cca107ddee31 --- /dev/null +++ b/train_log.txt @@ -0,0 +1,867 @@ +NEW RUN 2024-09-05-18-14-07 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-init.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 1260, 'epoch_count': 584, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 32, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-05-18-14-07', 'betas': (0.9, 0.99), 'real_bsz': 32, 'run_name': '8000 ctx2048 L12 D768'} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 32, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-05-18-15-19 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-init.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 1680, 'epoch_count': 584, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 24, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-05-18-15-19', 'betas': (0.9, 0.99), 'real_bsz': 24, 'run_name': '8000 ctx2048 L12 D768'} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 24, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-05-18-16-39 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-init.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2016, 'epoch_count': 584, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 20, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-05-18-16-39', 'betas': (0.9, 0.99), 'real_bsz': 20, 'run_name': '8000 ctx2048 L12 D768'} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 20, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-05-18-17-14 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-init.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2240, 'epoch_count': 584, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 18, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-05-18-17-14', 'betas': (0.9, 0.99), 'real_bsz': 18, 'run_name': '8000 ctx2048 L12 D768'} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 18, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-05-18-17-43 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-init.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2520, 'epoch_count': 584, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 16, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-05-18-17-43', 'betas': (0.9, 0.99), 'real_bsz': 16, 'run_name': '8000 ctx2048 L12 D768'} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 16, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-05-18-18-49 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-init.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-05-18-18-49', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768'} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +0 3.595117 36.4200 0.00050000 2024-09-05 21:27:01.559034 0 +1 3.070774 21.5586 0.00049999 2024-09-06 00:35:02.825330 1 +2 2.892551 18.0393 0.00049997 2024-09-06 03:43:04.930434 2 +3 2.808209 16.5802 0.00049995 2024-09-06 06:51:08.334852 3 +4 2.741276 15.5068 0.00049992 2024-09-06 09:59:16.284517 4 +5 2.694558 14.7990 0.00049988 2024-09-06 13:07:19.898543 5 +NEW RUN 2024-09-06-19-21-13 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-06-19-21-13', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-06-20-02-22 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 5040, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 8, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-06-20-02-22', 'betas': (0.9, 0.99), 'real_bsz': 8, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 8, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-09-51-03 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 5040, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 8, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-09-51-03', 'betas': (0.9, 0.99), 'real_bsz': 8, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 8, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-10-46-50 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 5040, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 8, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-10-46-50', 'betas': (0.9, 0.99), 'real_bsz': 8, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 8, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-10-49-07 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 5040, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 8, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-10-49-07', 'betas': (0.9, 0.99), 'real_bsz': 8, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 8, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-11-20-40 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 5040, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 8, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-11-20-40', 'betas': (0.9, 0.99), 'real_bsz': 8, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 8, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-12-01-42 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 40320, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 1, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-12-01-42', 'betas': (0.9, 0.99), 'real_bsz': 1, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-13-54-16 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 40320, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 1, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'auto', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-13-54-16', 'betas': (0.9, 0.99), 'real_bsz': 1, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +NEW RUN 2024-09-09-14-00-46 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 40320, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 1, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-14-00-46', 'betas': (0.9, 0.99), 'real_bsz': 1, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-14-03-55 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 40320, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 1, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-14-03-55', 'betas': (0.9, 0.99), 'real_bsz': 1, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-21-55-20 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 40320, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 1, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-21-55-20', 'betas': (0.9, 0.99), 'real_bsz': 1, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 1, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-21-57-09 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2520, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 16, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-21-57-09', 'betas': (0.9, 0.99), 'real_bsz': 16, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 16, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-09-21-57-50 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-09-21-57-50', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-09-12-19-58-13 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-5.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 6, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-12-19-58-13', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 4} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +6 2.654853 14.2229 0.00049984 2024-09-12 23:06:17.835881 0 +7 2.641623 14.0360 0.00049979 2024-09-13 02:14:16.096213 1 +8 2.612405 13.6318 0.00049974 2024-09-13 05:22:10.551701 2 +9 2.594084 13.3843 0.00049968 2024-09-13 08:30:05.003704 3 +10 2.576237 13.1476 0.00049961 2024-09-13 11:38:01.478437 4 +11 2.711689 15.0547 0.00049953 2024-09-13 14:45:29.679475 5 +12 2.542863 12.7160 0.00049945 2024-09-13 17:53:28.299187 6 +13 2.533426 12.5966 0.00049936 2024-09-13 21:01:24.329516 7 +14 2.517011 12.3915 0.00049927 2024-09-14 00:09:17.050096 8 +15 2.512259 12.3328 0.00049917 2024-09-14 03:17:16.006779 9 +16 2.496528 12.1403 0.00049906 2024-09-14 06:25:14.057159 10 +17 2.480046 11.9418 0.00049895 2024-09-14 09:33:11.532131 11 +18 2.480027 11.9416 0.00049883 2024-09-14 12:41:08.744158 12 +19 2.469632 11.8181 0.00049870 2024-09-14 15:49:06.510603 13 +20 2.459296 11.6966 0.00049857 2024-09-14 18:56:58.642568 14 +21 2.449449 11.5820 0.00049843 2024-09-14 22:05:02.706918 15 +22 2.446061 11.5428 0.00049829 2024-09-15 01:13:07.939236 16 +NEW RUN 2024-09-16-10-14-20 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-22.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 23, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-09-16-10-14-20', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 21} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +23 3.308922 27.3556 0.00049813 2024-09-16 13:22:24.557750 0 +24 2.444455 11.5243 0.00049797 2024-09-16 16:30:22.364825 1 +25 2.445294 11.5339 0.00049781 2024-09-16 19:38:19.567266 2 +26 2.435292 11.4192 0.00049764 2024-09-16 22:46:22.491002 3 +27 2.429837 11.3570 0.00049746 2024-09-17 01:54:19.619135 4 +28 2.435192 11.4180 0.00049728 2024-09-17 05:02:16.503676 5 +29 2.425060 11.3029 0.00049709 2024-09-17 08:10:13.527013 6 +30 2.417044 11.2127 0.00049689 2024-09-17 11:18:11.273752 7 +31 2.410162 11.1358 0.00049668 2024-09-17 14:26:08.658944 8 +32 2.404929 11.0776 0.00049647 2024-09-17 17:34:05.996774 9 +33 2.410824 11.1431 0.00049626 2024-09-17 20:42:03.626259 10 +34 2.409201 11.1251 0.00049604 2024-09-17 23:50:00.862901 11 +35 2.396419 10.9838 0.00049581 2024-09-18 02:57:57.315197 12 +36 2.398947 11.0116 0.00049557 2024-09-18 06:05:55.095674 13 +37 2.399094 11.0132 0.00049533 2024-09-18 09:13:53.202341 14 +38 2.389931 10.9127 0.00049508 2024-09-18 12:21:50.729821 15 +39 2.388270 10.8946 0.00049483 2024-09-18 15:29:48.579447 16 +40 2.379538 10.7999 0.00049457 2024-09-18 18:37:48.158962 17 +41 2.383583 10.8437 0.00049430 2024-09-18 21:45:47.381188 18 +42 2.380873 10.8143 0.00049403 2024-09-19 00:53:45.132135 19 +43 2.381291 10.8189 0.00049375 2024-09-19 04:01:43.702466 20 +44 2.367386 10.6695 0.00049346 2024-09-19 07:09:41.603293 21 +45 2.373619 10.7362 0.00049317 2024-09-19 10:17:40.337682 22 +46 2.365346 10.6477 0.00049287 2024-09-19 13:25:40.177298 23 +47 2.366740 10.6626 0.00049256 2024-09-19 16:33:38.957470 24 +48 2.357674 10.5663 0.00049225 2024-09-19 19:41:38.082875 25 +49 2.359804 10.5889 0.00049193 2024-09-19 22:49:39.617898 26 +50 2.353716 10.5246 0.00049161 2024-09-20 01:57:41.389806 27 +51 2.357576 10.5653 0.00049128 2024-09-20 05:05:43.636208 28 +52 2.360647 10.5978 0.00049094 2024-09-20 08:13:45.886391 29 +53 2.351457 10.5009 0.00049060 2024-09-20 11:21:47.930614 30 +54 2.344618 10.4293 0.00049025 2024-09-20 14:29:49.152955 31 +55 2.352412 10.5109 0.00048990 2024-09-20 17:37:50.638084 32 +56 2.350518 10.4910 0.00048954 2024-09-20 20:45:52.043190 33 +57 2.337321 10.3535 0.00048917 2024-09-20 23:53:53.502275 34 +58 2.336833 10.3484 0.00048880 2024-09-21 03:01:56.201618 35 +59 2.340571 10.3872 0.00048842 2024-09-21 06:09:58.160088 36 +60 2.333434 10.3133 0.00048803 2024-09-21 09:17:59.444376 37 +61 2.341987 10.4019 0.00048764 2024-09-21 12:26:00.812889 38 +62 2.341618 10.3980 0.00048724 2024-09-21 15:34:02.519587 39 +63 2.332180 10.3004 0.00048684 2024-09-21 18:42:03.472799 40 +64 2.328448 10.2620 0.00048642 2024-09-21 21:50:05.128249 41 +65 2.327097 10.2481 0.00048601 2024-09-22 00:58:06.506137 42 +66 2.332929 10.3081 0.00048559 2024-09-22 04:06:08.631787 43 +67 2.322295 10.1991 0.00048516 2024-09-22 07:14:10.461836 44 +68 2.320209 10.1778 0.00048472 2024-09-22 10:22:12.001861 45 +69 2.319038 10.1659 0.00048428 2024-09-22 13:30:14.568250 46 +70 2.317603 10.1513 0.00048383 2024-09-22 16:38:16.652428 47 +71 2.324875 10.2254 0.00048338 2024-09-22 19:46:18.628767 48 +72 2.318921 10.1647 0.00048292 2024-09-22 22:54:21.367615 49 +73 2.316281 10.1379 0.00048246 2024-09-23 02:02:21.795595 50 +74 2.310889 10.0834 0.00048199 2024-09-23 05:10:23.118162 51 +75 2.321528 10.1912 0.00048151 2024-09-23 08:18:23.178982 52 +76 2.317483 10.1501 0.00048103 2024-09-23 11:26:24.660151 53 +77 2.311013 10.0846 0.00048054 2024-09-23 14:34:25.399919 54 +78 2.312722 10.1019 0.00048004 2024-09-23 17:42:26.945421 55 +79 2.301967 9.9938 0.00047954 2024-09-23 20:50:48.476432 56 +80 2.305260 10.0268 0.00047904 2024-09-23 23:58:51.244134 57 +81 2.304329 10.0175 0.00047852 2024-09-24 03:06:49.744254 58 +82 2.307243 10.0467 0.00047801 2024-09-24 06:14:48.719305 59 +83 2.306288 10.0371 0.00047748 2024-09-24 09:22:46.891727 60 +84 2.299433 9.9685 0.00047695 2024-09-24 12:30:45.768111 61 +85 2.305211 10.0263 0.00047642 2024-09-24 15:38:44.826579 62 +86 2.298701 9.9612 0.00047587 2024-09-24 18:46:42.842461 63 +87 2.300703 9.9812 0.00047533 2024-09-24 21:54:40.264275 64 +88 2.298869 9.9629 0.00047477 2024-09-25 01:02:36.915440 65 +89 2.301904 9.9932 0.00047422 2024-09-25 04:10:34.388497 66 +90 2.291984 9.8945 0.00047365 2024-09-25 07:18:31.521904 67 +91 2.296954 9.9438 0.00047308 2024-09-25 10:26:28.529985 68 +92 2.290158 9.8765 0.00047250 2024-09-25 13:34:27.583946 69 +93 2.292169 9.8964 0.00047192 2024-09-25 16:42:25.878871 70 +94 2.282709 9.8032 0.00047133 2024-09-25 19:50:26.140240 71 +95 2.292399 9.8987 0.00047074 2024-09-25 22:58:25.614701 72 +96 2.289651 9.8715 0.00047014 2024-09-26 02:06:24.756666 73 +97 2.291699 9.8917 0.00046954 2024-09-26 05:14:23.411570 74 +98 2.291680 9.8915 0.00046893 2024-09-26 08:22:21.714347 75 +99 2.281809 9.7944 0.00046831 2024-09-26 11:30:20.165427 76 +100 2.280971 9.7862 0.00046769 2024-09-26 14:38:20.207117 77 +101 2.278608 9.7631 0.00046706 2024-09-26 17:46:19.294834 78 +102 2.287809 9.8533 0.00046643 2024-09-26 20:54:19.336010 79 +103 2.283577 9.8117 0.00046579 2024-09-27 00:02:17.927912 80 +104 2.268704 9.6669 0.00046515 2024-09-27 03:10:16.105128 81 +105 2.273969 9.7179 0.00046450 2024-09-27 06:18:14.193342 82 +106 2.277637 9.7536 0.00046385 2024-09-27 09:26:12.418163 83 +107 2.275033 9.7282 0.00046319 2024-09-27 12:34:09.538049 84 +108 2.273381 9.7122 0.00046252 2024-09-27 15:42:05.890163 85 +109 2.278353 9.7606 0.00046185 2024-09-27 18:50:02.436388 86 +110 2.277205 9.7494 0.00046118 2024-09-27 21:58:00.086943 87 +111 2.273684 9.7151 0.00046049 2024-09-28 01:05:57.364418 88 +112 2.270595 9.6852 0.00045981 2024-09-28 04:13:53.715732 89 +113 2.268522 9.6651 0.00045912 2024-09-28 07:21:50.057352 90 +114 2.276004 9.7377 0.00045842 2024-09-28 10:29:47.795725 91 +115 2.270058 9.6800 0.00045772 2024-09-28 13:37:44.690435 92 +116 2.265706 9.6379 0.00045701 2024-09-28 16:45:44.097449 93 +117 2.273728 9.7156 0.00045629 2024-09-28 19:53:43.270940 94 +118 2.274181 9.7200 0.00045558 2024-09-28 23:01:43.075332 95 +119 2.263156 9.6134 0.00045485 2024-09-29 02:09:42.328405 96 +120 2.261222 9.5948 0.00045412 2024-09-29 05:17:41.192517 97 +121 2.259776 9.5809 0.00045339 2024-09-29 08:25:39.497764 98 +122 2.263959 9.6211 0.00045265 2024-09-29 11:33:38.645199 99 +123 2.262193 9.6041 0.00045191 2024-09-29 14:41:36.527256 100 +124 2.257576 9.5599 0.00045116 2024-09-29 17:49:34.021742 101 +125 2.268986 9.6696 0.00045040 2024-09-29 20:57:32.620567 102 +126 2.264166 9.6231 0.00044964 2024-09-30 00:05:31.432102 103 +127 2.263108 9.6129 0.00044888 2024-09-30 03:13:30.956181 104 +128 2.258114 9.5650 0.00044811 2024-09-30 06:21:29.032811 105 +129 2.262823 9.6102 0.00044733 2024-09-30 09:29:26.344236 106 +130 2.256242 9.5471 0.00044655 2024-09-30 12:37:23.958587 107 +131 2.261125 9.5939 0.00044577 2024-09-30 15:45:22.366548 108 +132 2.263596 9.6176 0.00044498 2024-09-30 18:53:20.883489 109 +133 2.261941 9.6017 0.00044419 2024-09-30 22:01:17.510608 110 +134 2.260449 9.5874 0.00044339 2024-10-01 01:09:13.972485 111 +135 2.257278 9.5570 0.00044258 2024-10-01 04:17:12.203900 112 +136 2.252664 9.5130 0.00044177 2024-10-01 07:25:09.883290 113 +137 2.257007 9.5544 0.00044096 2024-10-01 10:33:07.345856 114 +138 2.256917 9.5536 0.00044014 2024-10-01 13:41:06.673120 115 +139 2.258553 9.5692 0.00043932 2024-10-01 16:49:06.735706 116 +140 2.251988 9.5066 0.00043849 2024-10-01 19:57:02.697724 117 +141 2.256190 9.5467 0.00043766 2024-10-01 23:05:01.982056 118 +142 2.251180 9.4989 0.00043682 2024-10-02 02:12:59.932524 119 +143 2.256548 9.5501 0.00043598 2024-10-02 05:20:57.405431 120 +144 2.245131 9.4416 0.00043513 2024-10-02 08:28:53.948451 121 +145 2.243460 9.4259 0.00043428 2024-10-02 11:36:50.295576 122 +146 2.246319 9.4529 0.00043342 2024-10-02 14:44:47.335961 123 +147 2.246829 9.4577 0.00043256 2024-10-02 17:52:44.483577 124 +148 2.249561 9.4836 0.00043170 2024-10-02 21:00:41.594506 125 +149 2.244434 9.4351 0.00043083 2024-10-03 00:08:37.520585 126 +150 2.242982 9.4214 0.00042995 2024-10-03 03:16:32.838056 127 +151 2.244518 9.4359 0.00042908 2024-10-03 06:24:26.802207 128 +152 2.251671 9.5036 0.00042819 2024-10-03 09:32:22.021171 129 +153 2.239019 9.3841 0.00042731 2024-10-03 12:40:17.242370 130 +154 2.238129 9.3758 0.00042641 2024-10-03 15:48:13.106382 131 +155 2.244846 9.4390 0.00042552 2024-10-03 18:56:09.273060 132 +156 2.241010 9.4028 0.00042462 2024-10-03 22:04:05.224339 133 +157 2.248031 9.4691 0.00042371 2024-10-04 01:12:02.126320 134 +158 2.245226 9.4425 0.00042280 2024-10-04 04:19:58.517114 135 +159 2.236211 9.3578 0.00042189 2024-10-04 07:27:54.596769 136 +160 2.243821 9.4293 0.00042097 2024-10-04 10:35:49.371722 137 +161 2.242890 9.4205 0.00042005 2024-10-04 13:43:44.571288 138 +162 2.255016 9.5354 0.00041912 2024-10-04 16:51:40.076761 139 +163 2.256285 9.5476 0.00041819 2024-10-04 19:59:36.495424 140 +164 2.246994 9.4593 0.00041726 2024-10-04 23:07:32.541324 141 +165 2.258005 9.5640 0.00041632 2024-10-05 02:15:28.180029 142 +166 2.264217 9.6236 0.00041538 2024-10-05 05:23:24.230274 143 +167 2.283236 9.8084 0.00041443 2024-10-05 08:31:19.947371 144 +168 2.292977 9.9044 0.00041348 2024-10-05 11:39:16.158644 145 +169 2.298107 9.9553 0.00041253 2024-10-05 14:47:13.939899 146 +170 2.295093 9.9254 0.00041157 2024-10-05 17:55:10.553725 147 +171 2.293226 9.9069 0.00041061 2024-10-05 21:03:06.484983 148 +172 2.309302 10.0674 0.00040964 2024-10-06 00:11:01.883957 149 +173 2.293905 9.9136 0.00040867 2024-10-06 03:18:56.545139 150 +174 2.303082 10.0050 0.00040769 2024-10-06 06:26:53.960006 151 +175 2.297317 9.9475 0.00040672 2024-10-06 09:34:50.991085 152 +176 2.313932 10.1141 0.00040573 2024-10-06 12:42:48.668089 153 +177 2.284660 9.8223 0.00040475 2024-10-06 15:50:44.931599 154 +178 2.299742 9.9716 0.00040376 2024-10-06 18:58:41.233234 155 +179 2.289250 9.8675 0.00040277 2024-10-06 22:06:37.603197 156 +180 2.268343 9.6634 0.00040177 2024-10-07 01:14:35.534462 157 +181 2.287410 9.8494 0.00040077 2024-10-07 04:22:32.852223 158 +182 2.337972 10.3602 0.00039977 2024-10-07 07:30:29.363150 159 +183 2.532663 12.5870 0.00039876 2024-10-07 10:38:24.957054 160 +184 2.361827 10.6103 0.00039775 2024-10-07 13:46:20.828643 161 +185 2.278654 9.7635 0.00039673 2024-10-07 16:54:18.432585 162 +186 2.272054 9.6993 0.00039571 2024-10-07 20:02:14.054714 163 +187 2.256340 9.5481 0.00039469 2024-10-07 23:10:10.596012 164 +188 2.249316 9.4813 0.00039367 2024-10-08 02:18:06.731705 165 +189 2.245209 9.4424 0.00039264 2024-10-08 05:26:03.221919 166 +190 2.253472 9.5207 0.00039161 2024-10-08 08:34:00.923779 167 +191 2.257031 9.5547 0.00039057 2024-10-08 11:41:57.138759 168 +192 2.241262 9.4052 0.00038953 2024-10-08 14:49:52.862716 169 +193 2.244301 9.4338 0.00038849 2024-10-08 17:57:48.198311 170 +194 2.240375 9.3969 0.00038745 2024-10-08 21:05:44.381534 171 +195 2.240864 9.4014 0.00038640 2024-10-09 00:13:42.092388 172 +196 2.229544 9.2956 0.00038535 2024-10-09 03:21:38.605843 173 +197 2.249870 9.4865 0.00038429 2024-10-09 06:29:35.383146 174 +198 2.243975 9.4307 0.00038323 2024-10-09 09:37:30.733423 175 +199 2.244301 9.4338 0.00038217 2024-10-09 12:45:27.527751 176 +200 2.242141 9.4135 0.00038111 2024-10-09 15:53:23.490265 177 +201 2.241176 9.4044 0.00038004 2024-10-09 19:01:19.533433 178 +202 2.242722 9.4189 0.00037897 2024-10-09 22:09:15.173339 179 +203 2.229698 9.2971 0.00037790 2024-10-10 01:17:11.000584 180 +204 2.238222 9.3766 0.00037682 2024-10-10 04:25:07.146229 181 +205 2.233217 9.3298 0.00037574 2024-10-10 07:33:03.580324 182 +206 2.230900 9.3082 0.00037466 2024-10-10 10:41:01.033876 183 +207 2.226750 9.2697 0.00037357 2024-10-10 13:48:57.710105 184 +208 2.231565 9.3144 0.00037248 2024-10-10 16:56:54.636884 185 +209 2.229007 9.2906 0.00037139 2024-10-10 20:04:52.192485 186 +210 2.229085 9.2914 0.00037030 2024-10-10 23:12:48.817715 187 +211 2.227306 9.2748 0.00036920 2024-10-11 02:20:46.083717 188 +212 2.225876 9.2616 0.00036811 2024-10-11 05:28:42.949091 189 +213 2.227067 9.2726 0.00036700 2024-10-11 08:36:39.951702 190 +214 2.233274 9.3304 0.00036590 2024-10-11 11:44:36.508731 191 +215 2.235631 9.3524 0.00036479 2024-10-11 14:52:33.312749 192 +216 2.223253 9.2373 0.00036368 2024-10-11 18:00:30.962619 193 +217 2.229853 9.2985 0.00036257 2024-10-11 21:08:29.563478 194 +218 2.228494 9.2859 0.00036146 2024-10-12 00:16:28.356990 195 +219 2.227029 9.2723 0.00036034 2024-10-12 03:24:26.040785 196 +220 2.232221 9.3205 0.00035922 2024-10-12 06:32:23.579517 197 +221 2.235590 9.3520 0.00035810 2024-10-12 09:40:18.735851 198 +222 2.227762 9.2791 0.00035697 2024-10-12 12:48:15.053741 199 +223 2.246086 9.4507 0.00035585 2024-10-12 15:56:11.849683 200 +224 2.235395 9.3502 0.00035472 2024-10-12 19:04:08.210279 201 +225 2.251375 9.5008 0.00035359 2024-10-12 22:12:04.448913 202 +226 2.240804 9.4009 0.00035245 2024-10-13 01:19:59.922510 203 +227 2.248315 9.4718 0.00035132 2024-10-13 04:27:56.024696 204 +228 2.233252 9.3302 0.00035018 2024-10-13 07:35:51.565662 205 +229 2.238808 9.3821 0.00034904 2024-10-13 10:43:47.303323 206 +230 2.235078 9.3472 0.00034790 2024-10-13 13:51:43.019341 207 +231 2.244279 9.4336 0.00034675 2024-10-13 16:59:39.047175 208 +232 2.240628 9.3992 0.00034560 2024-10-13 20:07:36.009892 209 +233 2.260626 9.5891 0.00034446 2024-10-13 23:15:33.053371 210 +234 2.250182 9.4895 0.00034331 2024-10-14 02:23:29.121453 211 +235 2.234285 9.3398 0.00034215 2024-10-14 05:31:24.913284 212 +236 2.250429 9.4918 0.00034100 2024-10-14 08:39:21.978448 213 +237 2.263376 9.6155 0.00033984 2024-10-14 11:47:19.859964 214 +238 2.267177 9.6521 0.00033868 2024-10-14 14:55:16.741165 215 +239 2.269803 9.6775 0.00033752 2024-10-14 18:03:13.119089 216 +240 2.270302 9.6823 0.00033636 2024-10-14 21:11:10.315272 217 +241 2.275382 9.7316 0.00033520 2024-10-15 00:19:07.322492 218 +242 2.279145 9.7683 0.00033403 2024-10-15 03:27:03.206595 219 +243 2.273844 9.7167 0.00033287 2024-10-15 06:35:00.553681 220 +244 2.275553 9.7333 0.00033170 2024-10-15 09:42:57.219830 221 +245 2.285927 9.8348 0.00033053 2024-10-15 12:50:54.154382 222 +246 2.289787 9.8728 0.00032935 2024-10-15 15:58:50.856688 223 +247 2.281703 9.7933 0.00032818 2024-10-15 19:06:46.560312 224 +248 2.305189 10.0261 0.00032701 2024-10-15 22:14:43.051158 225 +249 2.294965 9.9241 0.00032583 2024-10-16 01:22:38.789799 226 +250 2.290592 9.8808 0.00032465 2024-10-16 04:30:34.909332 227 +251 2.292510 9.8998 0.00032347 2024-10-16 07:38:31.789794 228 +252 2.310777 10.0823 0.00032229 2024-10-16 10:46:29.097084 229 +253 2.933819 18.7993 0.00032111 2024-10-16 13:54:25.987998 230 +254 2.307191 10.0462 0.00031992 2024-10-16 17:02:23.172302 231 +255 2.314025 10.1151 0.00031874 2024-10-16 20:10:19.849326 232 +256 2.299851 9.9727 0.00031755 2024-10-16 23:18:16.385530 233 +257 2.318522 10.1606 0.00031637 2024-10-17 02:26:12.346972 234 +258 2.655257 14.2286 0.00031518 2024-10-17 05:34:10.197475 235 +259 2.472963 11.8575 0.00031399 2024-10-17 08:42:06.645916 236 +260 2.471834 11.8442 0.00031280 2024-10-17 11:50:01.301988 237 +261 2.491824 12.0833 0.00031160 2024-10-17 14:57:56.810464 238 +262 2.558065 12.9108 0.00031041 2024-10-17 18:05:51.779506 239 +263 2.491412 12.0783 0.00030922 2024-10-17 21:13:48.114813 240 +264 2.359757 10.5884 0.00030802 2024-10-18 00:21:45.966848 241 +265 2.348394 10.4687 0.00030683 2024-10-18 03:29:40.519265 242 +266 2.346701 10.4510 0.00030563 2024-10-18 06:37:36.550909 243 +267 2.346625 10.4502 0.00030443 2024-10-18 09:45:32.396609 244 +268 2.336860 10.3487 0.00030323 2024-10-18 12:53:28.843248 245 +269 2.316585 10.1410 0.00030203 2024-10-18 16:01:25.048819 246 +270 2.371894 10.7177 0.00030083 2024-10-18 19:09:20.373205 247 +271 2.351020 10.4963 0.00029963 2024-10-18 22:17:15.723177 248 +272 2.339052 10.3714 0.00029843 2024-10-19 01:25:10.244933 249 +273 2.318221 10.1576 0.00029723 2024-10-19 04:33:05.875414 250 +274 2.306432 10.0385 0.00029602 2024-10-19 07:41:02.264757 251 +275 2.314016 10.1150 0.00029482 2024-10-19 10:48:57.990970 252 +276 2.303429 10.0084 0.00029362 2024-10-19 13:56:54.450982 253 +277 2.325887 10.2358 0.00029241 2024-10-19 17:04:52.256511 254 +278 2.304327 10.0174 0.00029121 2024-10-19 20:12:48.235431 255 +279 2.315929 10.1343 0.00029000 2024-10-19 23:20:45.817304 256 +280 2.314524 10.1201 0.00028879 2024-10-20 02:28:43.700201 257 +281 2.307699 10.0513 0.00028759 2024-10-20 05:36:40.457929 258 +282 2.322298 10.1991 0.00028638 2024-10-20 08:44:37.392195 259 +283 2.399612 11.0189 0.00028517 2024-10-20 11:52:33.615706 260 +284 2.320912 10.1850 0.00028397 2024-10-20 15:00:29.585275 261 +285 2.305599 10.0302 0.00028276 2024-10-20 18:08:26.323347 262 +286 2.309630 10.0707 0.00028155 2024-10-20 21:16:25.982959 263 +287 2.325084 10.2275 0.00028034 2024-10-21 00:24:27.025253 264 +288 2.331405 10.2924 0.00027913 2024-10-21 03:32:26.282280 265 +289 2.339160 10.3725 0.00027792 2024-10-21 06:40:24.890513 266 +290 2.304104 10.0152 0.00027672 2024-10-21 09:48:24.995918 267 +291 2.315834 10.1334 0.00027551 2024-10-21 12:56:24.242856 268 +292 2.303331 10.0075 0.00027430 2024-10-21 16:04:24.367714 269 +293 2.305482 10.0290 0.00027309 2024-10-21 19:12:25.229439 270 +294 2.302276 9.9969 0.00027188 2024-10-21 22:20:27.980932 271 +NEW RUN 2024-10-22-20-49-29 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-294.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 295, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-22-20-49-29', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 293} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +295 3.578803 35.8306 0.00027067 2024-10-22 23:55:57.107798 0 +296 2.339808 10.3792 0.00026946 2024-10-23 03:01:27.770491 1 +297 2.308089 10.0552 0.00026826 2024-10-23 06:06:58.030172 2 +298 2.308162 10.0559 0.00026705 2024-10-23 09:12:28.926671 3 +299 2.938897 18.8950 0.00026584 2024-10-23 12:18:01.291276 4 +300 2.744217 15.5524 0.00026463 2024-10-23 15:23:32.227327 5 +301 2.577450 13.1635 0.00026343 2024-10-23 18:29:02.823738 6 +NEW RUN 2024-10-24-10-57-51 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-301.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 302, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-24-10-57-51', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 300} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-10-24-13-30-12 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-301.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 302, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-24-13-30-12', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 300} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-10-24-14-29-15 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-300.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 301, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.001, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-24-14-29-15', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 299} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-10-24-14-48-42 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-212.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 2880, 'epoch_count': 584, 'epoch_begin': 213, 'epoch_save': 1, 'micro_bsz': 14, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 0, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-24-14-48-42', 'betas': (0.9, 0.99), 'real_bsz': 14, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 211} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 14, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +213 3.394097 29.7877 0.00036109 2024-10-24 17:54:24.703844 0 +214 2.235701 9.3530 0.00035994 2024-10-24 20:59:55.160681 1 +215 2.253263 9.5187 0.00035878 2024-10-25 00:05:27.031896 2 +216 2.243639 9.4276 0.00035762 2024-10-25 03:10:57.019360 3 +217 2.251221 9.4993 0.00035646 2024-10-25 06:16:27.408996 4 +218 2.253570 9.5217 0.00035530 2024-10-25 09:21:58.298274 5 +219 2.263175 9.6136 0.00035413 2024-10-25 12:27:30.281400 6 +220 2.271484 9.6938 0.00035296 2024-10-25 15:33:02.020171 7 +221 2.280767 9.7842 0.00035179 2024-10-25 18:38:33.741270 8 +222 2.287451 9.8498 0.00035062 2024-10-25 21:44:05.085571 9 +223 2.367093 10.6663 0.00034944 2024-10-26 00:49:36.007077 10 +224 2.524113 12.4798 0.00034826 2024-10-26 03:55:07.377851 11 +225 2.860170 17.4645 0.00034708 2024-10-26 07:00:39.531928 12 +226 2.793397 16.3364 0.00034589 2024-10-26 10:06:12.480271 13 +NEW RUN 2024-10-26-10-16-42 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-212.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 630, 'epoch_count': 584, 'epoch_begin': 213, 'epoch_save': 1, 'micro_bsz': 64, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-26-10-16-42', 'betas': (0.9, 0.99), 'real_bsz': 64, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 211} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 64, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-10-26-10-19-20 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-212.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 315, 'epoch_count': 584, 'epoch_begin': 213, 'epoch_save': 1, 'micro_bsz': 128, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-26-10-19-20', 'betas': (0.9, 0.99), 'real_bsz': 128, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 211} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 128, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-10-26-10-20-21 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-212.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 213, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-26-10-20-21', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 211} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-10-26-10-23-44 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-212.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 360, 'epoch_count': 584, 'epoch_begin': 213, 'epoch_save': 1, 'micro_bsz': 112, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-26-10-23-44', 'betas': (0.9, 0.99), 'real_bsz': 112, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 211} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 112, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-10-26-10-27-12 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-212.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 213, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-26-10-27-12', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 211} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +213 13.515048 740475.8974 0.00036111 2024-10-26 14:19:01.945854 0 +214 2.213318 9.1460 0.00035996 2024-10-26 18:10:40.851287 1 +215 2.214118 9.1533 0.00035880 2024-10-26 22:02:18.911815 2 +216 2.202455 9.0472 0.00035764 2024-10-27 01:53:54.155745 3 +217 2.207199 9.0902 0.00035648 2024-10-27 04:45:32.059794 4 +218 2.202121 9.0442 0.00035532 2024-10-27 08:37:10.163215 5 +219 2.204353 9.0644 0.00035415 2024-10-27 12:28:48.181434 6 +220 2.204036 9.0615 0.00035298 2024-10-27 16:20:24.701426 7 +221 2.204520 9.0659 0.00035181 2024-10-27 20:12:02.222561 8 +222 2.194420 8.9748 0.00035063 2024-10-28 00:03:41.609510 9 +223 2.205376 9.0737 0.00034946 2024-10-28 03:55:21.370192 10 +224 2.194699 8.9773 0.00034828 2024-10-28 07:46:59.726209 11 +225 2.203720 9.0587 0.00034710 2024-10-28 11:38:36.849574 12 +226 2.198977 9.0158 0.00034591 2024-10-28 15:30:16.653440 13 +227 2.207887 9.0965 0.00034473 2024-10-28 19:21:53.762503 14 +228 2.194959 8.9796 0.00034354 2024-10-28 23:13:38.284699 15 +229 2.204185 9.0629 0.00034235 2024-10-29 03:05:18.669698 16 +230 2.198196 9.0087 0.00034115 2024-10-29 06:57:03.396312 17 +231 2.202214 9.0450 0.00033996 2024-10-29 10:48:39.646037 18 +232 2.190495 8.9396 0.00033876 2024-10-29 14:40:16.417960 19 +NEW RUN 2024-10-29-21-23-14 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-232.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 233, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-29-21-23-14', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 231} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-10-29-21-31-52 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-232.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 233, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-10-29-21-31-52', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 231} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +233 2.906789 18.2980 0.00033756 2024-10-30 01:24:20.477980 0 +234 2.201358 9.0373 0.00033636 2024-10-30 05:15:57.874663 1 +235 2.196633 8.9947 0.00033515 2024-10-30 09:07:33.701986 2 +236 2.205878 9.0782 0.00033395 2024-10-30 12:59:09.413546 3 +237 2.202883 9.0511 0.00033274 2024-10-30 16:50:45.779184 4 +238 2.200353 9.0282 0.00033153 2024-10-30 20:42:22.631939 5 +239 2.201321 9.0369 0.00033032 2024-10-31 00:33:58.580754 6 +240 2.200093 9.0259 0.00032911 2024-10-31 04:25:33.092241 7 +241 2.200186 9.0267 0.00032789 2024-10-31 08:17:10.286737 8 +242 2.202660 9.0491 0.00032667 2024-10-31 12:08:47.060163 9 +243 2.196243 8.9912 0.00032545 2024-10-31 16:00:23.147350 10 +244 2.197563 9.0030 0.00032423 2024-10-31 19:52:01.164958 11 +245 2.195536 8.9848 0.00032301 2024-10-31 23:43:42.007740 12 +246 2.200335 9.0280 0.00032179 2024-11-01 03:35:20.657219 13 +247 2.195610 8.9855 0.00032056 2024-11-01 07:26:57.935099 14 +248 2.204613 9.0667 0.00031933 2024-11-01 11:18:37.583000 15 +249 2.206380 9.0828 0.00031810 2024-11-01 15:10:13.833505 16 +250 2.205469 9.0745 0.00031687 2024-11-01 19:01:56.406835 17 +251 2.203720 9.0587 0.00031564 2024-11-01 22:53:32.887320 18 +252 2.202604 9.0485 0.00031441 2024-11-02 02:45:09.627798 19 +253 2.203330 9.0551 0.00031317 2024-11-02 06:36:45.501540 20 +254 2.207645 9.0943 0.00031194 2024-11-02 10:28:22.247007 21 +255 2.202995 9.0521 0.00031070 2024-11-02 14:19:58.140717 22 +256 2.201228 9.0361 0.00030946 2024-11-02 18:11:40.922186 23 +257 2.204725 9.0678 0.00030822 2024-11-02 22:03:18.243151 24 +258 2.201916 9.0423 0.00030698 2024-11-03 01:54:55.237919 25 +259 2.205562 9.0753 0.00030574 2024-11-03 05:46:32.216899 26 +260 2.203032 9.0524 0.00030449 2024-11-03 09:38:13.288623 27 +261 2.207738 9.0951 0.00030325 2024-11-03 13:29:51.681029 28 +262 2.198977 9.0158 0.00030200 2024-11-03 17:21:28.926161 29 +263 2.206901 9.0875 0.00030075 2024-11-03 21:13:04.269363 30 +264 2.204036 9.0615 0.00029951 2024-11-04 01:04:46.880276 31 +265 2.201953 9.0427 0.00029826 2024-11-04 04:56:23.619047 32 +266 2.212537 9.1389 0.00029701 2024-11-04 08:48:05.665388 33 +267 2.212481 9.1384 0.00029576 2024-11-04 12:39:45.436349 34 +268 2.218824 9.1965 0.00029450 2024-11-04 16:31:24.591365 35 +269 2.207626 9.0941 0.00029325 2024-11-04 20:23:03.638860 36 +270 2.214360 9.1555 0.00029200 2024-11-05 00:14:47.196669 37 +271 2.220145 9.2087 0.00029074 2024-11-05 04:06:23.729579 38 +272 2.217950 9.1885 0.00028949 2024-11-05 07:58:02.819919 39 +273 2.222824 9.2334 0.00028823 2024-11-05 11:49:39.800719 40 +274 2.222675 9.2320 0.00028697 2024-11-05 15:41:22.618411 41 +NEW RUN 2024-11-05-17-53-27 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-274.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 275, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-11-05-17-53-27', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 273} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +275 2.549628 12.8023 0.00028572 2024-11-05 21:45:16.770176 0 +276 2.234301 9.3399 0.00028446 2024-11-06 01:36:56.095555 1 +277 2.228237 9.2835 0.00028320 2024-11-06 05:28:35.781492 2 +278 2.239397 9.3877 0.00028194 2024-11-06 09:20:12.772117 3 +279 2.233538 9.3328 0.00028068 2024-11-06 13:11:50.934863 4 +280 2.248586 9.4743 0.00027942 2024-11-06 17:03:31.919337 5 +281 2.257366 9.5579 0.00027816 2024-11-06 20:55:11.348428 6 +282 2.251935 9.5061 0.00027690 2024-11-07 00:46:54.966993 7 +283 2.250893 9.4962 0.00027564 2024-11-07 04:38:33.389166 8 +284 2.248103 9.4698 0.00027438 2024-11-07 08:30:12.896129 9 +285 2.239286 9.3866 0.00027312 2024-11-07 12:21:57.292606 10 +286 2.243359 9.4249 0.00027186 2024-11-07 16:13:47.976532 11 +287 2.249144 9.4796 0.00027059 2024-11-07 20:05:33.759306 12 +288 2.246150 9.4513 0.00026933 2024-11-07 23:57:17.724819 13 +289 2.244141 9.4323 0.00026807 2024-11-08 03:48:55.771512 14 +290 2.254576 9.5313 0.00026681 2024-11-08 07:40:46.198204 15 +291 2.257682 9.5609 0.00026555 2024-11-08 11:32:23.429925 16 +292 2.240458 9.3976 0.00026428 2024-11-08 15:24:03.758469 17 +293 2.263318 9.6149 0.00026302 2024-11-08 19:15:49.507968 18 +294 2.340699 10.3885 0.00026176 2024-11-08 23:07:27.353130 19 +295 2.258966 9.5732 0.00026050 2024-11-09 02:59:04.660628 20 +296 2.252530 9.5118 0.00025923 2024-11-09 06:50:48.201730 21 +297 2.259710 9.5803 0.00025797 2024-11-09 10:42:26.050557 22 +298 2.259877 9.5819 0.00025671 2024-11-09 14:34:03.565565 23 +299 2.260714 9.5899 0.00025545 2024-11-09 18:25:42.383068 24 +NEW RUN 2024-11-09-19-03-08 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-299.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 300, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 3e-05, 'warmup_steps': 10, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-24, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2024-11-09-19-03-08', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 298} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-11-09-20-19-35 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-262.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 263, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 0.7, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 0.7, 'my_timestamp': '2024-11-09-20-19-35', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 261} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 0.7, 'bf16': {'enabled': True}} +263 2.204948 9.0698 0.00029017 2024-11-10 00:11:30.686169 0 +264 2.202158 9.0445 0.00028886 2024-11-10 04:03:07.650761 1 +265 2.208222 9.0995 0.00028754 2024-11-10 07:54:41.438891 2 +266 2.212909 9.1423 0.00028623 2024-11-10 11:46:15.271041 3 +267 2.205618 9.0759 0.00028491 2024-11-10 15:37:51.216205 4 +268 2.206678 9.0855 0.00028359 2024-11-10 19:29:25.042269 5 +269 2.196652 8.9948 0.00028227 2024-11-10 23:20:58.857795 6 +270 2.199628 9.0217 0.00028095 2024-11-11 03:12:33.611865 7 +271 2.198251 9.0092 0.00027963 2024-11-11 07:04:10.683401 8 +272 2.198158 9.0084 0.00027831 2024-11-11 10:55:52.723286 9 +273 2.199851 9.0237 0.00027698 2024-11-11 14:47:27.624019 10 +274 2.190532 8.9400 0.00027566 2024-11-11 18:39:05.831055 11 +275 2.196317 8.9918 0.00027434 2024-11-11 22:30:50.740086 12 +276 2.196466 8.9932 0.00027301 2024-11-12 02:22:26.047908 13 +277 2.194196 8.9728 0.00027169 2024-11-12 06:14:02.486955 14 +278 2.195982 8.9888 0.00027036 2024-11-12 10:05:44.572027 15 +279 2.200446 9.0290 0.00026903 2024-11-12 13:57:22.251942 16 +280 2.199498 9.0205 0.00026771 2024-11-12 17:48:57.953231 17 +281 2.196298 8.9917 0.00026638 2024-11-12 21:40:33.925038 18 +282 2.204446 9.0652 0.00026505 2024-11-13 01:32:11.599200 19 +283 2.199795 9.0232 0.00026372 2024-11-13 05:23:50.712790 20 +284 2.198140 9.0082 0.00026240 2024-11-13 09:15:26.990706 21 +285 2.192485 8.9574 0.00026107 2024-11-13 13:07:07.330005 22 +286 2.201544 9.0390 0.00025974 2024-11-13 16:58:45.497932 23 +287 2.204074 9.0619 0.00025841 2024-11-13 20:50:24.352603 24 +288 2.195908 8.9882 0.00025708 2024-11-14 00:42:01.913385 25 +289 2.205171 9.0718 0.00025575 2024-11-14 04:33:38.777683 26 +290 2.199516 9.0206 0.00025442 2024-11-14 08:25:16.898193 27 +291 2.203013 9.0523 0.00025309 2024-11-14 12:16:53.603013 28 +292 2.197321 9.0009 0.00025176 2024-11-14 16:08:31.886469 29 +293 2.199107 9.0170 0.00025043 2024-11-14 20:00:09.415995 30 +294 2.195796 8.9872 0.00024910 2024-11-14 23:51:46.148352 31 +295 2.199405 9.0196 0.00024777 2024-11-15 03:43:32.241863 32 +296 2.190551 8.9401 0.00024644 2024-11-15 07:35:09.229673 33 +297 2.199442 9.0200 0.00024511 2024-11-15 11:26:50.587166 34 +298 2.196689 8.9952 0.00024378 2024-11-15 15:18:29.031349 35 +299 2.191871 8.9519 0.00024246 2024-11-15 19:10:07.649710 36 +300 2.200800 9.0322 0.00024113 2024-11-15 23:01:45.357632 37 +301 2.190811 8.9425 0.00023980 2024-11-16 02:53:22.006695 38 +302 2.200298 9.0277 0.00023847 2024-11-16 06:45:04.365308 39 +303 2.190755 8.9420 0.00023714 2024-11-16 10:36:42.148248 40 +304 2.200391 9.0285 0.00023582 2024-11-16 14:28:22.232832 41 +NEW RUN 2024-11-16-14-51-57 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-304.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 305, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 0.8, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 0.8, 'my_timestamp': '2024-11-16-14-51-57', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 303} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 0.8, 'bf16': {'enabled': True}} +305 2.189453 8.9303 0.00023449 2024-11-16 18:43:49.344059 0 +306 2.197991 9.0069 0.00023317 2024-11-16 22:35:26.362465 1 +307 2.190569 8.9403 0.00023184 2024-11-17 02:27:02.917167 2 +308 2.200465 9.0292 0.00023052 2024-11-17 06:18:39.071816 3 +309 2.191109 8.9451 0.00022919 2024-11-17 10:10:18.697806 4 +310 2.193601 8.9674 0.00022787 2024-11-17 14:01:58.288103 5 +311 2.199795 9.0232 0.00022655 2024-11-17 17:53:36.053466 6 +312 2.199684 9.0222 0.00022522 2024-11-17 21:45:15.641911 7 +313 2.191797 8.9513 0.00022390 2024-11-18 01:36:59.671833 8 +314 2.193824 8.9695 0.00022258 2024-11-18 05:28:36.268821 9 +315 2.200502 9.0295 0.00022126 2024-11-18 09:20:13.231885 10 +316 2.191834 8.9516 0.00021994 2024-11-18 13:11:51.366623 11 +317 2.190104 8.9361 0.00021863 2024-11-18 17:03:32.746412 12 +318 2.199275 9.0185 0.00021731 2024-11-18 20:55:08.377792 13 +319 2.196484 8.9933 0.00021599 2024-11-19 00:46:46.141182 14 +320 2.185733 8.8972 0.00021468 2024-11-19 04:38:22.785055 15 +321 2.192206 8.9549 0.00021337 2024-11-19 08:30:05.951145 16 +322 2.199740 9.0227 0.00021205 2024-11-19 12:21:41.995605 17 +323 2.195610 8.9855 0.00021074 2024-11-19 16:13:19.690052 18 +324 2.190699 8.9415 0.00020943 2024-11-19 20:04:56.823531 19 +325 2.197266 9.0004 0.00020812 2024-11-19 23:56:34.701085 20 +326 2.196708 8.9953 0.00020682 2024-11-20 03:48:16.524435 21 +327 2.193080 8.9628 0.00020551 2024-11-20 07:39:58.725742 22 +328 2.187165 8.9099 0.00020421 2024-11-20 11:31:37.579474 23 +NEW RUN 2024-11-20-11-59-34 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-328.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 329, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 0.9, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 0.9, 'my_timestamp': '2024-11-20-11-59-34', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 327} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 0.9, 'bf16': {'enabled': True}} +329 2.196131 8.9902 0.00020290 2024-11-20 15:51:22.301664 0 +330 2.189528 8.9310 0.00020160 2024-11-20 19:42:58.588114 1 +331 2.194475 8.9753 0.00020030 2024-11-20 23:34:41.549313 2 +332 2.188821 8.9247 0.00019900 2024-11-21 03:26:17.410231 3 +333 2.184282 8.8843 0.00019770 2024-11-21 07:17:53.141872 4 +334 2.196224 8.9910 0.00019641 2024-11-21 11:09:35.511870 5 +335 2.194922 8.9793 0.00019511 2024-11-21 15:01:11.133229 6 +336 2.193118 8.9631 0.00019382 2024-11-21 18:52:48.988102 7 +337 2.196131 8.9902 0.00019253 2024-11-21 22:44:25.266278 8 +338 2.191220 8.9461 0.00019124 2024-11-22 02:36:00.232557 9 +339 2.193955 8.9706 0.00018995 2024-11-22 06:27:37.895199 10 +340 2.193694 8.9683 0.00018867 2024-11-22 10:19:17.176566 11 +341 2.189397 8.9298 0.00018739 2024-11-22 14:11:03.769256 12 +342 2.183612 8.8783 0.00018610 2024-11-22 18:03:00.365872 13 +343 2.198214 9.0089 0.00018482 2024-11-22 21:54:35.509634 14 +344 2.186291 8.9021 0.00018355 2024-11-23 01:46:15.597037 15 +345 2.188002 8.9174 0.00018227 2024-11-23 05:37:51.654126 16 +346 2.194122 8.9721 0.00018100 2024-11-23 09:29:27.365795 17 +347 2.196931 8.9974 0.00017972 2024-11-23 13:21:04.608882 18 +348 2.187481 8.9127 0.00017845 2024-11-23 17:12:39.817879 19 +349 2.187184 8.9101 0.00017719 2024-11-23 21:04:19.587533 20 +350 2.189676 8.9323 0.00017592 2024-11-24 00:55:58.808421 21 +351 2.189825 8.9337 0.00017466 2024-11-24 04:47:37.230005 22 +352 2.186998 8.9084 0.00017340 2024-11-24 08:39:12.629759 23 +353 2.193155 8.9634 0.00017214 2024-11-24 12:30:49.021949 24 +354 2.187333 8.9114 0.00017088 2024-11-24 16:22:26.659353 25 +355 2.190811 8.9425 0.00016963 2024-11-24 20:14:01.361873 26 +356 2.187165 8.9099 0.00016838 2024-11-25 00:05:42.751493 27 +357 2.192615 8.9586 0.00016713 2024-11-25 03:57:19.249163 28 +358 2.187574 8.9136 0.00016588 2024-11-25 07:48:54.107451 29 +359 2.184245 8.8839 0.00016464 2024-11-25 11:40:35.497217 30 +360 2.185733 8.8972 0.00016339 2024-11-25 15:32:11.759838 31 +NEW RUN 2024-11-25-16-22-57 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-360.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 361, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-11-25-16-22-57', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 359} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +361 2.183631 8.8785 0.00016215 2024-11-25 20:14:46.088646 0 +362 2.185677 8.8967 0.00016092 2024-11-26 00:06:21.102065 1 +363 2.183017 8.8730 0.00015968 2024-11-26 03:57:59.216597 2 +364 2.186979 8.9083 0.00015845 2024-11-26 07:49:37.017550 3 +365 2.184449 8.8858 0.00015722 2024-11-26 11:41:24.660713 4 +366 2.185547 8.8955 0.00015600 2024-11-26 15:33:00.877469 5 +367 2.184952 8.8902 0.00015478 2024-11-26 19:24:52.695798 6 +368 2.184840 8.8892 0.00015356 2024-11-26 23:16:28.528355 7 +369 2.186403 8.9031 0.00015234 2024-11-27 03:08:16.100704 8 +370 2.192188 8.9548 0.00015112 2024-11-27 06:59:59.320076 9 +371 2.193043 8.9624 0.00014991 2024-11-27 10:51:33.091873 10 +372 2.188876 8.9252 0.00014870 2024-11-27 14:43:09.460757 11 +373 2.179557 8.8424 0.00014750 2024-11-27 18:34:47.542157 12 +374 2.189416 8.9300 0.00014630 2024-11-27 22:26:22.854484 13 +375 2.183315 8.8757 0.00014510 2024-11-28 02:18:07.356776 14 +376 2.184226 8.8838 0.00014390 2024-11-28 06:09:42.734138 15 +377 2.179799 8.8445 0.00014271 2024-11-28 10:01:19.497651 16 +378 2.179818 8.8447 0.00014152 2024-11-28 13:52:55.835804 17 +379 2.177939 8.8281 0.00014033 2024-11-28 17:44:32.463213 18 +380 2.183408 8.8765 0.00013915 2024-11-28 21:36:09.306672 19 +381 2.180952 8.8547 0.00013797 2024-11-29 01:27:55.796968 20 +382 2.181027 8.8554 0.00013679 2024-11-29 05:19:32.480852 21 +383 2.180748 8.8529 0.00013562 2024-11-29 09:11:14.246443 22 +384 2.184133 8.8829 0.00013444 2024-11-29 13:02:51.432276 23 +385 2.183836 8.8803 0.00013328 2024-11-29 16:54:28.237535 24 +386 2.179669 8.8434 0.00013211 2024-11-29 20:46:05.657423 25 +387 2.186031 8.8998 0.00013095 2024-11-30 00:37:41.132471 26 +388 2.181696 8.8613 0.00012980 2024-11-30 04:29:28.832040 27 +389 2.183724 8.8793 0.00012864 2024-11-30 08:21:10.639453 28 +390 2.179129 8.8386 0.00012750 2024-11-30 12:12:48.712436 29 +391 2.181417 8.8589 0.00012635 2024-11-30 16:04:24.805511 30 +392 2.185119 8.8917 0.00012521 2024-11-30 19:56:15.782816 31 +NEW RUN 2024-12-02-23-16-58 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-392.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 393, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-12-02-23-16-58', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 391} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-12-03-11-44-17 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-392.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 393, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-12-03-11-44-17', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 391} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-12-03-12-17-57 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-392.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 393, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-12-03-12-17-57', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 391} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-12-03-18-56-19 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-392.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 393, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-12-03-18-56-19', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 391} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +393 2.176749 8.8176 0.00012407 2024-12-03 22:48:15.147657 0 +394 2.178571 8.8337 0.00012293 2024-12-04 02:40:02.791358 1 +395 2.176376 8.8143 0.00012180 2024-12-04 06:31:42.271274 2 +396 2.180618 8.8518 0.00012068 2024-12-04 10:23:24.701219 3 +397 2.172675 8.7817 0.00011955 2024-12-04 14:15:06.113990 4 +398 2.181715 8.8615 0.00011843 2024-12-04 18:06:52.816435 5 +399 2.174647 8.7991 0.00011732 2024-12-04 21:58:32.859525 6 +400 2.174200 8.7951 0.00011621 2024-12-05 01:50:23.834511 7 +401 2.175186 8.8038 0.00011510 2024-12-05 05:42:06.404790 8 +402 2.176990 8.8197 0.00011399 2024-12-05 09:33:48.445011 9 +403 2.177288 8.8223 0.00011289 2024-12-05 13:25:29.070893 10 +404 2.171856 8.7746 0.00011180 2024-12-05 17:17:11.246894 11 +405 2.171522 8.7716 0.00011071 2024-12-05 21:08:53.155997 12 +406 2.177753 8.8265 0.00010962 2024-12-06 01:00:31.934975 13 +407 2.171559 8.7719 0.00010854 2024-12-06 04:52:17.070847 14 +408 2.171689 8.7731 0.00010746 2024-12-06 08:43:57.101929 15 +409 2.180469 8.8505 0.00010638 2024-12-06 12:35:49.934638 16 +410 2.175112 8.8032 0.00010531 2024-12-06 16:27:31.388207 17 +411 2.166946 8.7316 0.00010424 2024-12-06 20:19:14.334485 18 +412 2.175316 8.8050 0.00010318 2024-12-07 00:10:59.535747 19 +413 2.176990 8.8197 0.00010212 2024-12-07 04:02:40.424567 20 +414 2.172563 8.7808 0.00010107 2024-12-07 07:54:29.463284 21 +415 2.162593 8.6937 0.00010002 2024-12-07 11:46:09.586002 22 +416 2.170685 8.7643 0.00009897 2024-12-07 15:37:51.628679 23 +417 2.173382 8.7880 0.00009793 2024-12-07 19:29:36.703860 24 +418 2.175781 8.8091 0.00009690 2024-12-07 23:21:16.081472 25 +419 2.165885 8.7223 0.00009586 2024-12-08 03:13:01.072341 26 +420 2.167374 8.7353 0.00009484 2024-12-08 07:04:47.579088 27 +421 2.167448 8.7360 0.00009381 2024-12-08 10:56:27.472205 28 +422 2.172842 8.7832 0.00009280 2024-12-08 14:48:10.454581 29 +423 2.160454 8.6751 0.00009178 2024-12-08 18:39:49.724189 30 +424 2.169327 8.7524 0.00009077 2024-12-08 22:31:30.061023 31 +425 2.164881 8.7136 0.00008977 2024-12-09 02:23:09.841189 32 +426 2.167690 8.7381 0.00008877 2024-12-09 06:14:48.818175 33 +427 2.165086 8.7153 0.00008778 2024-12-09 10:06:37.142897 34 +428 2.166964 8.7317 0.00008679 2024-12-09 13:58:21.745502 35 +429 2.162537 8.6932 0.00008580 2024-12-09 17:50:08.807902 36 +NEW RUN 2024-12-09-20-19-57 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-429.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 430, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-12-09-20-19-57', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 428} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +NEW RUN 2024-12-09-21-32-38 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-429.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 430, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-12-09-21-32-38', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 428} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +430 2.165997 8.7233 0.00008482 2024-12-10 01:24:41.668514 0 +431 2.165141 8.7158 0.00008384 2024-12-10 05:16:22.508090 1 +432 2.164751 8.7124 0.00008287 2024-12-10 09:08:06.167981 2 +433 2.164807 8.7129 0.00008191 2024-12-10 12:59:51.537265 3 +434 2.166574 8.7283 0.00008095 2024-12-10 16:51:31.275981 4 +435 2.165160 8.7160 0.00007999 2024-12-10 20:43:25.319898 5 +436 2.161365 8.6830 0.00007904 2024-12-11 00:35:04.629472 6 +437 2.164211 8.7077 0.00007809 2024-12-11 04:26:44.285042 7 +438 2.163318 8.7000 0.00007715 2024-12-11 08:18:22.567084 8 +439 2.163244 8.6993 0.00007622 2024-12-11 12:10:13.943164 9 +440 2.155097 8.6287 0.00007529 2024-12-11 16:01:53.721569 10 +441 2.163393 8.7006 0.00007436 2024-12-11 19:53:34.547418 11 +442 2.162630 8.6940 0.00007344 2024-12-11 23:45:15.989023 12 +443 2.160026 8.6714 0.00007253 2024-12-12 03:36:59.565257 13 +444 2.159189 8.6641 0.00007162 2024-12-12 07:28:40.834843 14 +445 2.161477 8.6840 0.00007071 2024-12-12 11:20:28.301040 15 +446 2.160863 8.6786 0.00006981 2024-12-12 15:12:10.389413 16 +447 2.158129 8.6549 0.00006892 2024-12-12 19:04:02.707749 17 +NEW RUN 2024-12-15-12-31-01 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-447.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 448, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-12-15-12-31-01', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 446} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +448 2.164100 8.7068 0.00006803 2024-12-15 16:23:03.303743 0 +449 2.159896 8.6702 0.00006714 2024-12-15 20:14:43.161619 1 +450 2.156399 8.6400 0.00006627 2024-12-16 00:06:25.604764 2 +451 2.157199 8.6469 0.00006539 2024-12-16 03:58:05.393093 3 +452 2.159914 8.6704 0.00006452 2024-12-16 07:49:48.379952 4 +453 2.158259 8.6561 0.00006366 2024-12-16 11:41:27.538965 5 +454 2.154222 8.6212 0.00006281 2024-12-16 15:33:06.796426 6 +455 2.158092 8.6546 0.00006195 2024-12-16 19:24:50.131247 7 +456 2.159710 8.6686 0.00006111 2024-12-16 23:16:38.512296 8 +457 2.157645 8.6507 0.00006027 2024-12-17 03:08:18.202448 9 +458 2.152846 8.6093 0.00005943 2024-12-17 07:00:11.254106 10 +459 2.152195 8.6037 0.00005860 2024-12-17 10:51:57.762594 11 +460 2.155618 8.6332 0.00005778 2024-12-17 14:43:42.951704 12 +461 2.162370 8.6917 0.00005696 2024-12-17 18:35:21.718473 13 +462 2.152232 8.6040 0.00005615 2024-12-17 22:27:04.079020 14 +463 2.152939 8.6101 0.00005534 2024-12-18 02:18:47.172700 15 +464 2.153125 8.6117 0.00005454 2024-12-18 06:10:27.250655 16 +NEW RUN 2024-12-19-11-09-20 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-464.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 465, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2024-12-19-11-09-20', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 463} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +465 2.160361 8.6743 0.00005375 2024-12-19 15:01:12.753007 0 +466 2.150874 8.5924 0.00005296 2024-12-19 18:52:55.757587 1 +467 2.158538 8.6585 0.00005217 2024-12-19 22:44:37.771349 2 +468 2.147898 8.5668 0.00005140 2024-12-20 02:36:21.017293 3 +469 2.149702 8.5823 0.00005062 2024-12-20 06:28:06.163320 4 +470 2.149572 8.5812 0.00004986 2024-12-20 10:19:49.533962 5 +471 2.146875 8.5581 0.00004910 2024-12-20 14:11:43.408353 6 +472 2.147005 8.5592 0.00004834 2024-12-20 18:03:26.822179 7 +473 2.149126 8.5774 0.00004759 2024-12-20 21:55:09.437860 8 +474 2.157347 8.6482 0.00004685 2024-12-21 01:47:02.101170 9 +475 2.154315 8.6220 0.00004612 2024-12-21 05:38:53.711881 10 +476 2.150074 8.5855 0.00004538 2024-12-21 09:30:40.032347 11 +477 2.151693 8.5994 0.00004466 2024-12-21 13:22:25.373140 12 +478 2.151116 8.5944 0.00004394 2024-12-21 17:14:10.602866 13 +479 2.153199 8.6124 0.00004323 2024-12-21 21:06:00.884196 14 +480 2.149051 8.5767 0.00004252 2024-12-22 00:57:44.478412 15 +481 2.146726 8.5568 0.00004182 2024-12-22 04:49:36.585285 16 +482 2.148456 8.5716 0.00004113 2024-12-22 08:41:32.834425 17 +483 2.142783 8.5231 0.00004044 2024-12-22 12:33:15.493955 18 +484 2.148419 8.5713 0.00003975 2024-12-22 16:25:00.922796 19 +485 2.148158 8.5691 0.00003908 2024-12-22 20:16:44.150765 20 +486 2.147340 8.5621 0.00003841 2024-12-23 00:08:28.272687 21 +487 2.149740 8.5826 0.00003774 2024-12-23 04:00:12.015684 22 +488 2.139063 8.4915 0.00003709 2024-12-23 07:51:56.017206 23 +489 2.147080 8.5598 0.00003643 2024-12-23 11:43:41.014928 24 +490 2.143527 8.5295 0.00003579 2024-12-23 15:35:24.432837 25 +491 2.151842 8.6007 0.00003515 2024-12-23 19:27:07.634134 26 +492 2.145647 8.5476 0.00003452 2024-12-23 23:18:51.480389 27 +493 2.141536 8.5125 0.00003389 2024-12-24 03:10:36.449937 28 +494 2.144308 8.5361 0.00003327 2024-12-24 07:02:18.596918 29 +495 2.143899 8.5326 0.00003266 2024-12-24 10:54:05.284148 30 +496 2.145647 8.5476 0.00003205 2024-12-24 14:45:52.067904 31 +497 2.141332 8.5108 0.00003145 2024-12-24 18:37:44.511239 32 +498 2.140290 8.5019 0.00003085 2024-12-24 22:29:27.867929 33 +499 2.148121 8.5687 0.00003026 2024-12-25 02:21:11.494214 34 +500 2.145294 8.5446 0.00002968 2024-12-25 06:12:55.956263 35 +501 2.137965 8.4822 0.00002911 2024-12-25 10:04:49.128693 36 +502 2.142727 8.5226 0.00002854 2024-12-25 13:56:36.948353 37 +503 2.138951 8.4905 0.00002797 2024-12-25 17:48:36.774453 38 +504 2.145740 8.5484 0.00002742 2024-12-25 21:40:23.697855 39 +505 2.140588 8.5044 0.00002687 2024-12-26 01:32:06.271816 40 +506 2.147712 8.5652 0.00002633 2024-12-26 05:23:53.823630 41 +507 2.141183 8.5095 0.00002579 2024-12-26 09:15:48.224657 42 +508 2.141499 8.5122 0.00002526 2024-12-26 13:07:31.471699 43 +509 2.140309 8.5021 0.00002473 2024-12-26 16:59:15.188654 44 +510 2.134635 8.4540 0.00002422 2024-12-26 20:51:11.218670 45 +511 2.145871 8.5495 0.00002371 2024-12-27 00:42:54.498985 46 +512 2.136477 8.4695 0.00002320 2024-12-27 04:34:37.144143 47 +513 2.143806 8.5318 0.00002271 2024-12-27 08:26:38.219630 48 +514 2.139993 8.4994 0.00002222 2024-12-27 12:18:21.708856 49 +515 2.132161 8.4331 0.00002173 2024-12-27 16:10:03.405034 50 +516 2.144847 8.5407 0.00002125 2024-12-27 20:01:55.274763 51 +517 2.136291 8.4680 0.00002078 2024-12-27 23:53:51.145494 52 +518 2.139193 8.4926 0.00002032 2024-12-28 03:45:33.988866 53 +519 2.136607 8.4706 0.00001986 2024-12-28 07:37:16.685572 54 +520 2.142299 8.5190 0.00001941 2024-12-28 11:28:58.456320 55 +521 2.128776 8.4046 0.00001897 2024-12-28 15:20:41.118652 56 +522 2.138300 8.4850 0.00001853 2024-12-28 19:12:23.567093 57 +523 2.139211 8.4927 0.00001810 2024-12-28 23:04:05.516249 58 +524 2.139453 8.4948 0.00001768 2024-12-29 02:55:56.188651 59 +525 2.129613 8.4116 0.00001726 2024-12-29 06:47:37.815953 60 +526 2.138300 8.4850 0.00001685 2024-12-29 10:39:25.105527 61 +527 2.138597 8.4875 0.00001645 2024-12-29 14:31:07.174016 62 +528 2.139900 8.4986 0.00001605 2024-12-29 18:22:49.650640 63 +529 2.137333 8.4768 0.00001566 2024-12-29 22:14:34.394822 64 +530 2.138132 8.4836 0.00001528 2024-12-30 02:06:17.064597 65 +531 2.139230 8.4929 0.00001490 2024-12-30 05:58:13.661928 66 +532 2.136272 8.4678 0.00001454 2024-12-30 09:50:08.516375 67 +533 2.135807 8.4639 0.00001417 2024-12-30 13:41:51.381631 68 +534 2.135379 8.4603 0.00001382 2024-12-30 17:33:33.791074 69 +535 2.140737 8.5057 0.00001347 2024-12-30 21:25:27.040631 70 +536 2.131250 8.4254 0.00001313 2024-12-31 01:17:10.988979 71 +537 2.134115 8.4496 0.00001279 2024-12-31 05:08:55.684918 72 +538 2.133854 8.4474 0.00001247 2024-12-31 09:00:46.271230 73 +539 2.132087 8.4324 0.00001215 2024-12-31 12:52:42.760800 74 +540 2.134152 8.4499 0.00001183 2024-12-31 16:44:26.721768 75 +541 2.132645 8.4372 0.00001153 2024-12-31 20:36:13.789819 76 +542 2.136793 8.4722 0.00001123 2025-01-01 00:28:13.400040 77 +543 2.133240 8.4422 0.00001093 2025-01-01 04:19:56.507061 78 +544 2.136551 8.4702 0.00001065 2025-01-01 08:11:40.386391 79 +545 2.137147 8.4752 0.00001037 2025-01-01 12:03:23.297370 80 +546 2.132645 8.4372 0.00001010 2025-01-01 15:55:15.061673 81 +547 2.131659 8.4288 0.00000983 2025-01-01 19:47:00.453898 82 +548 2.129613 8.4116 0.00000957 2025-01-01 23:38:44.569019 83 +549 2.141871 8.5154 0.00000932 2025-01-02 03:30:39.424905 84 +550 2.134989 8.4570 0.00000908 2025-01-02 07:22:22.485915 85 +551 2.132701 8.4376 0.00000884 2025-01-02 11:14:06.952606 86 +552 2.140272 8.5017 0.00000861 2025-01-02 15:05:52.780331 87 +553 2.136477 8.4695 0.00000839 2025-01-02 18:57:42.382772 88 +554 2.143173 8.5265 0.00000817 2025-01-02 22:49:29.320755 89 +555 2.132664 8.4373 0.00000796 2025-01-03 02:41:17.742425 90 +556 2.128292 8.4005 0.00000776 2025-01-03 06:33:16.400378 91 +557 2.130208 8.4166 0.00000757 2025-01-03 10:25:05.692484 92 +558 2.128832 8.4050 0.00000738 2025-01-03 14:16:48.554039 93 +559 2.128069 8.3986 0.00000720 2025-01-03 18:08:36.337734 94 +560 2.138709 8.4885 0.00000703 2025-01-03 22:00:27.777289 95 +561 2.126879 8.3886 0.00000686 2025-01-04 01:52:16.602372 96 +562 2.128832 8.4050 0.00000670 2025-01-04 05:44:18.839517 97 +563 2.130041 8.4152 0.00000655 2025-01-04 09:36:00.700856 98 +564 2.134449 8.4524 0.00000640 2025-01-04 13:27:43.306351 99 +565 2.127995 8.3980 0.00000627 2025-01-04 17:19:25.640607 100 +566 2.132310 8.4343 0.00000614 2025-01-04 21:11:08.696177 101 +567 2.131436 8.4270 0.00000601 2025-01-05 01:02:51.038974 102 +568 2.132682 8.4375 0.00000590 2025-01-05 04:54:38.367126 103 +569 2.138207 8.4842 0.00000579 2025-01-05 08:46:21.911847 104 +570 2.132924 8.4395 0.00000568 2025-01-05 12:38:03.874544 105 +571 2.128274 8.4004 0.00000559 2025-01-05 16:29:56.920860 106 +572 2.129650 8.4119 0.00000550 2025-01-05 20:21:39.075777 107 +573 2.136217 8.4673 0.00000542 2025-01-06 00:13:40.915087 108 +574 2.125428 8.3765 0.00000535 2025-01-06 04:05:31.116694 109 +575 2.132124 8.4328 0.00000528 2025-01-06 07:57:13.551209 110 +576 2.135305 8.4596 0.00000522 2025-01-06 11:48:55.999996 111 +577 2.128850 8.4052 0.00000517 2025-01-06 15:40:45.224787 112 +578 2.129427 8.4100 0.00000512 2025-01-06 19:32:45.122256 113 +NEW RUN 2025-01-06-20-42-02 +{'load_model': 'out/L12-D768-x060-assholerwkv/rwkv-578.pth', 'wandb': 'tami4', 'proj_dir': 'out/L12-D768-x060-assholerwkv', 'random_seed': -1, 'train_type': '', 'data_file': 'data/pol_tokenized', 'data_type': 'binidx', 'vocab_size': 8000, 'ctx_len': 2048, 'epoch_steps': 420, 'epoch_count': 584, 'epoch_begin': 579, 'epoch_save': 1, 'micro_bsz': 96, 'n_layer': 12, 'n_embd': 768, 'dim_att': 768, 'dim_ffn': 2688, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0005, 'lr_final': 5e-06, 'warmup_steps': 20, 'beta1': 0.9, 'beta2': 0.95, 'adam_eps': 1e-18, 'grad_cp': 1, 'dropout': 0, 'weight_decay': 0.1, 'weight_decay_final': -1, 'my_pile_version': 1, 'my_pile_stage': 3, 'my_pile_shift': 0, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 2, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'head_size_a': 64, 'head_size_divisor': 8, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 23580569, 'my_qa_mask': 0, 'my_random_steps': 0, 'my_testing': 'x060', 'my_exit': 99999999, 'my_exit_tokens': 48293025696, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'grad_clip': 1.0, 'my_timestamp': '2025-01-06-20-42-02', 'betas': (0.9, 0.95), 'real_bsz': 96, 'run_name': '8000 ctx2048 L12 D768', 'my_pile_prev_p': 577} +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 2000000, 'reduce_bucket_size': 2000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 96, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}} +579 2.131808 8.4301 0.00000508 2025-01-07 00:33:56.540676 0 +580 2.134170 8.4500 0.00000505 2025-01-07 04:25:46.800226 1 +581 2.128795 8.4047 0.00000503 2025-01-07 08:17:26.873613 2 +582 2.135826 8.4640 0.00000501 2025-01-07 12:09:02.829357 3 +583 2.130915 8.4226 0.00000500 2025-01-07 16:00:53.731388 4