diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..66015ab0e1215c94ba106413a952b3a001f25ad0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,259 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +test_qwen_sdxl_2.png filter=lfs diff=lfs merge=lfs -text +test_qwen_sdxl_1.png filter=lfs diff=lfs merge=lfs -text +output.png filter=lfs diff=lfs merge=lfs -text +augmented_prompts.jsonl filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2f0752fbb34d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0c60468c2f43.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/69df76e79d3a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/93373c8aafcb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/901637e9c9cb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5df8f6f96448.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d337c665d640.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/22cff8295d2e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/40da46f29016.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fa5fabcf698e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4360dd6c5fe6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/30359e982f49.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6ff9f8e3bc30.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/59c147e61072.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fa83bef79f58.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/93c913311968.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e9dfcff21425.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2b6a55b91a90.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/be943688a592.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d9f2aa1ba51d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7a5292306239.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/52805af09a5d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f8866eadaa51.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b6ea090258fa.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/eff53c185549.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5730af2fb53b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0f51603809e6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/25bc959f6502.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3da0c01cc34b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8830712aa637.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0344cd990769.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/84ead255f716.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/858fafff6c75.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a6bb9c91d34f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/24858dd06a26.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cbf43ed5326a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/69b08a21655a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c99fbd49f81e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/16480c3be83f.png filter=lfs 
diff=lfs merge=lfs -text +illustrious_generated/0237c0d416e5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/87b516fc3620.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/97117446b9bc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4969e23914e7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ee3d34de5c79.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d6c9f0a9e161.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/472d6f988766.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fe6ae3e9c893.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/76ab88233a8a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/093fd2e1b007.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/808edae3dfdb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6371d5db8285.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/dbd730eadc95.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3693124afec3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2168ac0ce1f3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0f049a4bf8e6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ae44ccdc53f3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6169e58592f5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2a5384439c3d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/98d24abcd7dc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ea9390c9334a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f149cbf64f42.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2c3877e6ad8c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1eb9d1a4ba7b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/015614b1fb75.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/eb0a51a08785.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/030ff1cfe876.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/612a66962266.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3064604370dd.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e08f6e84dc64.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0c2194ec9448.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6f3c00c7fe54.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/94d5225e009d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/13645624b916.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/852cf82eab3b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2fe068868319.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5f11f3eb1c1c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/682bbc2c61b4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4a928e27008c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3668a4618273.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9c961fbe1f6c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ac13afd03cf4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/538320539396.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fcc811dfea64.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7095bb63dced.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/335a4d8318dc.png filter=lfs diff=lfs merge=lfs -text 
+illustrious_generated/7ac893769529.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bbd198e1056b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9917f32d22ba.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fea6cb2e79bc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b4d7041ee056.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cd971f942cae.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0de268bff97d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/37d6b0261a20.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cdb2b548c854.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1917fe59e561.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2f91bb2671e5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d72cfefd9423.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fb96879652d6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7a8dee7fe0e5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1e185bd19f3b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/24a1e120693b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a45e381ff37c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f7a76637f5ff.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5d114a44299d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b9bb1a4f4e90.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ea26c25bf7aa.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2664c4e866bc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/eae6fa8da581.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/78718ff36e2c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cc21e3368371.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4753abc59d67.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/337d3617ab33.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9ef5c666dee6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4f890acc5300.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8bca1099ef12.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c2a3f4eaf18f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7779dd857b31.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d3bd2a22b273.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3575e7061f8b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cba2d736599c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e5eae5f51f9a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d48dd85a0e62.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1009cb168f2a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b919ce31fadb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4d20b70e8708.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8afdb58cdbe3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a6a03395863a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5bbfed62d132.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ecec4646a402.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2ae725c4a0d0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/676918d623cc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ca26ac4bf239.png filter=lfs 
diff=lfs merge=lfs -text +illustrious_generated/feb2e566f26d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/59499e6ab909.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c5c9559835a0.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/11f86242e18b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7926406498df.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3b956a9b1fa6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8f61e6e0ecbb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/81b59f537d5b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5b858ea629d3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/83bc95efaad6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7494b0a6a33c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a55256636f4a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/12ddd81412fe.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b60c16076bb4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/9adfacdf6a05.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5b95a8c1c1cc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b41ab503548d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/150124c1381e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1bca56bbafcc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d9e560672527.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a0629ea1fa16.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/56da4fd854cf.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c6b905cdcfeb.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2fa99de5fb4e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c3b6106b5d1d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/92863f01c2a2.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/deab8fc10451.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5566f4e6dbb8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6a30a8a30272.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6c80e465d221.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e0c9797bdcc4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/66833ca0b42b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/560118678ad5.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/aa12167e1b22.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/49af3580681e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/31d9280e70ec.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2afa4f25f333.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/24d8252db59e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/75aae2a95a1c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8f24cf790818.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b86b095aad9e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e886088f9603.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/70f4e182ad99.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/da83ff193381.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6d0dffee930e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c315f64e722e.png filter=lfs diff=lfs merge=lfs -text 
+illustrious_generated/08a3b54b095f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/2ad78a0f9e2a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c1f19445e27b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d979e3d32abf.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/3c1537e059f3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1b5b80748fe6.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/59cdbdf0892c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/451a24867701.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f5d412584d10.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/790a13ed888e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bd237c20c4a4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1185827620cc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/59789c1bd20c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0fb0cd81d56a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0c9994122d65.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a3730f6a349c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/c8effc769399.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/83845ff7fe93.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/853f4976e742.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5d20b65efa2c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/daa4477a619b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e7153dbd46c4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bd7187a15eb7.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b497d88bf627.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6dc3596347dd.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a96f15690715.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7865e4e37a96.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/98cf9e81c386.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ced077dcba6a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/41f8c017c60b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b02dd2e58a3a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/8315f76b269d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/a599040af025.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5137928cfeec.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/bee14f23ce18.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/35b1f3a2020f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b3e4152bedd8.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cf18aa678c82.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4e97c5bd3369.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fc7d541ba16e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/5e8ee0566361.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/319935ee0505.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/576c452a2186.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fc4cfe2d8bbd.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/704c754e9c7c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/14ae580f7b52.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/4ad3fe906431.png filter=lfs 
diff=lfs merge=lfs -text +illustrious_generated/fa1bc4acdc0d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0789069e3606.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e8d2670c6e8a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/20d19a4eba57.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/67c0c434f1a3.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d563fc3ed6cf.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e30322f764fa.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0da6655f00bd.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/861efde50ba4.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/560933340e7c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/6c4d377e3b51.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e0e1f519c17b.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f84052e8b91d.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/cc49299f0c4f.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/b7a1f3ab80ee.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/51ccc9c08464.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/ca6f517ea8fa.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/80c6d995aefc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/1841b7d73988.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/18186cc9802c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/e0e2270fe38c.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/0babe9636b23.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/fb837ee1983a.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/f1475a16a1cc.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/222548d84cde.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/7ea16faf601e.png filter=lfs diff=lfs merge=lfs -text +illustrious_generated/d62bf5dcbc94.png filter=lfs diff=lfs merge=lfs -text diff --git a/augmented_prompts.jsonl b/augmented_prompts.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..baae594c7c9df11f22505fc83440b4cb6e74aa91 --- /dev/null +++ b/augmented_prompts.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efa00615fff1913fd9464166c1fb606c2b0711dbebe63d6d6d04aa4b9f680a00 +size 15886651 diff --git a/illustrious_generated/015614b1fb75.png b/illustrious_generated/015614b1fb75.png new file mode 100644 index 0000000000000000000000000000000000000000..3d619c628669ac0a6a743bb046c20f2337a72ba1 --- /dev/null +++ b/illustrious_generated/015614b1fb75.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efa5aaa8f965d3162e2bbcd78f579c7c08f9c2d55b8fd5c990ccd898bc5cce3a +size 891422 diff --git a/illustrious_generated/0237c0d416e5.png b/illustrious_generated/0237c0d416e5.png new file mode 100644 index 0000000000000000000000000000000000000000..fef7dccfdfbcb27e2f7931487a4cf1c528482ac6 --- /dev/null +++ b/illustrious_generated/0237c0d416e5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:621e55b030d44b874d83409e43b1edf8cff1c5d071ad95fc72ee42056453ea45 +size 1927303 diff --git a/illustrious_generated/030ff1cfe876.png b/illustrious_generated/030ff1cfe876.png new file mode 100644 index 0000000000000000000000000000000000000000..9e7707aa9b2a42e8ae01dacac609684a4f181a17 --- /dev/null +++ b/illustrious_generated/030ff1cfe876.png @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:2a8ba50369fa2c66d3a0db5986b58a689ca85f05906073594b0474bb7f7bdf7a +size 2581184 diff --git a/illustrious_generated/0344cd990769.png b/illustrious_generated/0344cd990769.png new file mode 100644 index 0000000000000000000000000000000000000000..c81ac0de3f8107d3f362f62c42d410ae13e4b49f --- /dev/null +++ b/illustrious_generated/0344cd990769.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:195bd3dc5b58c1de16015ab4ba33172ef8f69dc4b117107116e4f25d3e792ada +size 853805 diff --git a/illustrious_generated/0789069e3606.png b/illustrious_generated/0789069e3606.png new file mode 100644 index 0000000000000000000000000000000000000000..7841531be5de662fba5fc56a85b228e9363266b2 --- /dev/null +++ b/illustrious_generated/0789069e3606.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:473d3852a2088bcb6bd8fb86fc8335bd6674bd10dbd55975c011fcee1131c551 +size 1010525 diff --git a/illustrious_generated/08a3b54b095f.png b/illustrious_generated/08a3b54b095f.png new file mode 100644 index 0000000000000000000000000000000000000000..e834d932e460314d981d96aadf4e715ca5832f09 --- /dev/null +++ b/illustrious_generated/08a3b54b095f.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d7b0f0903907026059235e4a3075c0f7175b6283021590d149c2e0ef442a0bc +size 946068 diff --git a/illustrious_generated/093fd2e1b007.png b/illustrious_generated/093fd2e1b007.png new file mode 100644 index 0000000000000000000000000000000000000000..d0dfc5dcd7d84f7274c3ca0284757f681fb2fc14 --- /dev/null +++ b/illustrious_generated/093fd2e1b007.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a03d9ee2b4f2d0f74ccb8fb68e57dc3a90ad099df8f87710d06a37bb00760ab8 +size 2460068 diff --git a/illustrious_generated/0babe9636b23.png b/illustrious_generated/0babe9636b23.png new file mode 100644 index 0000000000000000000000000000000000000000..b1c5d59627446c05e3f7ad62f4d8010e5e128ee2 --- /dev/null +++ b/illustrious_generated/0babe9636b23.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b5dc8b91b5ca4af84920ed543d4e955c5f2c1261ad0f657f787b58fc0274984 +size 2473522 diff --git a/illustrious_generated/0c2194ec9448.png b/illustrious_generated/0c2194ec9448.png new file mode 100644 index 0000000000000000000000000000000000000000..739d622cb3b8b544485828336d57263635ed3a5d --- /dev/null +++ b/illustrious_generated/0c2194ec9448.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba88ebc336838c7f199efbd325e76586c898b44335f4ebfc97ea7f65e023157a +size 449449 diff --git a/illustrious_generated/0c60468c2f43.png b/illustrious_generated/0c60468c2f43.png new file mode 100644 index 0000000000000000000000000000000000000000..acac8c723d1b97eb43efbad3a7e34b8fe1f6990d --- /dev/null +++ b/illustrious_generated/0c60468c2f43.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3e501b3a1b4fe51b3ae2c53c766fd87ad1f04ab444c3b40223270159047116e +size 1195092 diff --git a/illustrious_generated/0c9994122d65.png b/illustrious_generated/0c9994122d65.png new file mode 100644 index 0000000000000000000000000000000000000000..be00363237ec09687490ffbb7ac929fa9458b5c2 --- /dev/null +++ b/illustrious_generated/0c9994122d65.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee25ad48ea9c2a18c6db4b005d1ad0f691cc0864ea8bb253c64f13dfb1e8b78 +size 3862808 diff --git a/illustrious_generated/0da6655f00bd.png b/illustrious_generated/0da6655f00bd.png new file mode 100644 
index 0000000000000000000000000000000000000000..fb6dbefb47544c3b3eada514ed39bba416fcf89c --- /dev/null +++ b/illustrious_generated/0da6655f00bd.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73fcb0c3abbdb399c6297b89c24c3e20eb6c91ee7f806bea615244080365e0bc +size 3243520 diff --git a/illustrious_generated/0de268bff97d.png b/illustrious_generated/0de268bff97d.png new file mode 100644 index 0000000000000000000000000000000000000000..ffe1607914dc7472748c5ab462083858ef455190 --- /dev/null +++ b/illustrious_generated/0de268bff97d.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99fac9010eab1fe912c74281fc62a2cc11e5f9e133449b31aecb986dca6cd336 +size 2663668 diff --git a/illustrious_generated/0f049a4bf8e6.png b/illustrious_generated/0f049a4bf8e6.png new file mode 100644 index 0000000000000000000000000000000000000000..5b5f8f648b23e08deff04f6ba6fba5af263c093f --- /dev/null +++ b/illustrious_generated/0f049a4bf8e6.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43b7f2446cf26653da5d4ed2e46f19a1791624e59999ed1234af356da2be7f62 +size 380459 diff --git a/illustrious_generated/0f51603809e6.png b/illustrious_generated/0f51603809e6.png new file mode 100644 index 0000000000000000000000000000000000000000..f8e641d31a92303aa84bc516f7b33483b16da315 --- /dev/null +++ b/illustrious_generated/0f51603809e6.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0180fc02362df41b00dbd41a92be41bde31076531f8fa23c3725e010f2f8746a +size 1468898 diff --git a/illustrious_generated/0fb0cd81d56a.png b/illustrious_generated/0fb0cd81d56a.png new file mode 100644 index 0000000000000000000000000000000000000000..8463fa4259f65194d862862670bae2ea4e1ed291 --- /dev/null +++ b/illustrious_generated/0fb0cd81d56a.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf8b92cc9e8f27d7fd958ceb50d81be77f486316c03d137914dd2ee8b388466d +size 822084 diff --git a/illustrious_generated/1009cb168f2a.png b/illustrious_generated/1009cb168f2a.png new file mode 100644 index 0000000000000000000000000000000000000000..b1ce816125e86e8515d3cbb926fbb4859ce873a4 --- /dev/null +++ b/illustrious_generated/1009cb168f2a.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aeaa92e4c2638525989db9d54012fb1c6c20527fe9fa90b63320c37d4b056c4 +size 1936241 diff --git a/illustrious_generated/1185827620cc.png b/illustrious_generated/1185827620cc.png new file mode 100644 index 0000000000000000000000000000000000000000..8314cfb4cc5316794c318f5b3111548e9dda1dcb --- /dev/null +++ b/illustrious_generated/1185827620cc.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c65eb277286fc2eaec2ba0f2551afef49a70d14f16e75fd3cd57c57e269e2c5 +size 1476541 diff --git a/illustrious_generated/11f86242e18b.png b/illustrious_generated/11f86242e18b.png new file mode 100644 index 0000000000000000000000000000000000000000..bb8e0ccff5ebef40b9aa76a8b862814c3beb66c6 --- /dev/null +++ b/illustrious_generated/11f86242e18b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e30cffa018c1371583538f9809a3197e8a34976333db27eb03689e6b731b9b1b +size 797551 diff --git a/illustrious_generated/12ddd81412fe.png b/illustrious_generated/12ddd81412fe.png new file mode 100644 index 0000000000000000000000000000000000000000..036e15ed63b817df34f3c77b8257755313cefb57 --- /dev/null +++ b/illustrious_generated/12ddd81412fe.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3d6a125543f78358be28e90ae11c273c51b3c8db6f2debbaed212cc43c7ddc86 +size 839551 diff --git a/illustrious_generated/13645624b916.png b/illustrious_generated/13645624b916.png new file mode 100644 index 0000000000000000000000000000000000000000..87356d9f0a417c871b0374a152f5e86e7145d685 --- /dev/null +++ b/illustrious_generated/13645624b916.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7537d9e6970d1ba9ccac07cc643691cedc55a21699895098083580540d0a031 +size 1502479 diff --git a/illustrious_generated/14ae580f7b52.png b/illustrious_generated/14ae580f7b52.png new file mode 100644 index 0000000000000000000000000000000000000000..d70b06f137b96b6331bf191d3ef998e60620ce71 --- /dev/null +++ b/illustrious_generated/14ae580f7b52.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd1ea2e16074b76b58cd3deef71846a15a2877056e58ed27af399e4a41d6bfbb +size 531496 diff --git a/illustrious_generated/150124c1381e.png b/illustrious_generated/150124c1381e.png new file mode 100644 index 0000000000000000000000000000000000000000..26f33dc7c23db40844967eff5764cc143c2c8deb --- /dev/null +++ b/illustrious_generated/150124c1381e.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:296de1965fb14252f0e59912dd5527fc39da2fb49fcbff6018e78c9c6e2d542e +size 2803838 diff --git a/illustrious_generated/16480c3be83f.png b/illustrious_generated/16480c3be83f.png new file mode 100644 index 0000000000000000000000000000000000000000..1a20ba9c7211328db0621674ccd2e6e54766ac16 --- /dev/null +++ b/illustrious_generated/16480c3be83f.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9acb15deedb89f7f6fc2f04e3733440c7c9a9032edac3840ce612f5be60b73db +size 1495106 diff --git a/illustrious_generated/18186cc9802c.png b/illustrious_generated/18186cc9802c.png new file mode 100644 index 0000000000000000000000000000000000000000..f8c7c214cf74f2781857026a0ef6a68f1cbf0150 --- /dev/null +++ b/illustrious_generated/18186cc9802c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:865493d9ff85c332c5eaf1ae29b8d21dbc376e510e14c1aaa22485e55b4d2964 +size 1570392 diff --git a/illustrious_generated/1841b7d73988.png b/illustrious_generated/1841b7d73988.png new file mode 100644 index 0000000000000000000000000000000000000000..9b855bb8d8c168d6a6d2f232730307340c793ce9 --- /dev/null +++ b/illustrious_generated/1841b7d73988.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a6b3f041d65cd9b748395bd16f8e7b3b6fa3125b870d32b844008494c7f4e6f +size 1045084 diff --git a/illustrious_generated/1917fe59e561.png b/illustrious_generated/1917fe59e561.png new file mode 100644 index 0000000000000000000000000000000000000000..50c918a9b3ab1d73617baddfb4123675f1a09bc7 --- /dev/null +++ b/illustrious_generated/1917fe59e561.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:558824bddfa53ccf222ba8f64de8131c28bf2dc3133614d1b515d7f354cf4fa0 +size 4132326 diff --git a/illustrious_generated/1b5b80748fe6.png b/illustrious_generated/1b5b80748fe6.png new file mode 100644 index 0000000000000000000000000000000000000000..3f1011cf957fa117931778a3ac6a54f6fedd422f --- /dev/null +++ b/illustrious_generated/1b5b80748fe6.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc12c8c6183b5ee03d1673d812ea315c89810e87fe42bb49e54c32c05f2362c2 +size 3023267 diff --git a/illustrious_generated/1bca56bbafcc.png b/illustrious_generated/1bca56bbafcc.png new file mode 100644 index 
0000000000000000000000000000000000000000..06ae2738134b35648ada18c5e59eef6c5331bc8e --- /dev/null +++ b/illustrious_generated/1bca56bbafcc.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd4fc8eb885deab1c4de9c20c8c9e0a94b1086a26b7f200201e7766420d13cc7 +size 881372 diff --git a/illustrious_generated/1e185bd19f3b.png b/illustrious_generated/1e185bd19f3b.png new file mode 100644 index 0000000000000000000000000000000000000000..17fa462d2cab02402b275871388413d95e7fe657 --- /dev/null +++ b/illustrious_generated/1e185bd19f3b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aab37f07eae1a7881e0c0acdd06475dc98894f2da8e57f79825bc62aab747fa +size 3538009 diff --git a/illustrious_generated/1eb9d1a4ba7b.png b/illustrious_generated/1eb9d1a4ba7b.png new file mode 100644 index 0000000000000000000000000000000000000000..3d7c195173f65959b1c13dbfac37eede28f44a76 --- /dev/null +++ b/illustrious_generated/1eb9d1a4ba7b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4a6584eeffe55a25627ae629e87e328d10a67ef0b98d069d0b7f0f942a6b62a +size 1756493 diff --git a/illustrious_generated/20d19a4eba57.png b/illustrious_generated/20d19a4eba57.png new file mode 100644 index 0000000000000000000000000000000000000000..cd50e14689493e8f24f79742885bab5b249176e1 --- /dev/null +++ b/illustrious_generated/20d19a4eba57.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c80ce861b44804b05aab6c701d73a85c54b44fc3a698533dcaea2ec4feb572eb +size 1005610 diff --git a/illustrious_generated/2168ac0ce1f3.png b/illustrious_generated/2168ac0ce1f3.png new file mode 100644 index 0000000000000000000000000000000000000000..d8eb73b32ee78d4c7d475e232fd115b4c43d842c --- /dev/null +++ b/illustrious_generated/2168ac0ce1f3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25971f22402fdc753b0a06134130f820cd571dd4a0ed10f4fbb4f0ac0aeaee68 +size 688597 diff --git a/illustrious_generated/222548d84cde.png b/illustrious_generated/222548d84cde.png new file mode 100644 index 0000000000000000000000000000000000000000..65ad7a106d3edda3c4d4e957e741a460b4630d2d --- /dev/null +++ b/illustrious_generated/222548d84cde.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34489a6b29a5041bab3625646f27b08c4a706bc46d4e5787748db3ce11f4ca9c +size 3371976 diff --git a/illustrious_generated/22cff8295d2e.png b/illustrious_generated/22cff8295d2e.png new file mode 100644 index 0000000000000000000000000000000000000000..c20928b7363b44806dd048e7cfb75136f5997465 --- /dev/null +++ b/illustrious_generated/22cff8295d2e.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99c5ba096ed4d960ea81a8ded4bd88f27d7ac6d14d77f7a49e9bdeea22b0e768 +size 1107211 diff --git a/illustrious_generated/24858dd06a26.png b/illustrious_generated/24858dd06a26.png new file mode 100644 index 0000000000000000000000000000000000000000..b2718d8748325343a19b841ab76cc559915f6d16 --- /dev/null +++ b/illustrious_generated/24858dd06a26.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28edf42e2f23e878ac8a2aa71bda32b97d6eaed90a9c5d46188c395d027b8193 +size 1405109 diff --git a/illustrious_generated/24a1e120693b.png b/illustrious_generated/24a1e120693b.png new file mode 100644 index 0000000000000000000000000000000000000000..2b2d7e7778d228ca8b4418a632c02590fa8ba0dd --- /dev/null +++ b/illustrious_generated/24a1e120693b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9e44a3017e9764edfda607c503a33feb7e7053814829f1f414e8352abeb0a02d +size 630660 diff --git a/illustrious_generated/24d8252db59e.png b/illustrious_generated/24d8252db59e.png new file mode 100644 index 0000000000000000000000000000000000000000..cfc1a935750b431655cfd742cbb013ff551562db --- /dev/null +++ b/illustrious_generated/24d8252db59e.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33c90c9efecf3d8e557435f1ab6c79ff7f6973e271d288ff1387f75e2eaf57fd +size 1418719 diff --git a/illustrious_generated/25bc959f6502.png b/illustrious_generated/25bc959f6502.png new file mode 100644 index 0000000000000000000000000000000000000000..5dbf097f53ec18a7e2c56cb1baf53a06dc80804f --- /dev/null +++ b/illustrious_generated/25bc959f6502.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c0b67685bbdb84020f3608f7684ff867a330b0308450e81546b5aa4a6a244d5 +size 2195778 diff --git a/illustrious_generated/2664c4e866bc.png b/illustrious_generated/2664c4e866bc.png new file mode 100644 index 0000000000000000000000000000000000000000..2978580a89e873e889745c2add604793236f5e70 --- /dev/null +++ b/illustrious_generated/2664c4e866bc.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f39ca3e4384b43d37841a6a19bd99526eae02bf5efd16b74d64fadeb9ba43c +size 2841405 diff --git a/illustrious_generated/2a5384439c3d.png b/illustrious_generated/2a5384439c3d.png new file mode 100644 index 0000000000000000000000000000000000000000..55f2c42f94b4522390518458f40d6c4fb63c048a --- /dev/null +++ b/illustrious_generated/2a5384439c3d.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e46ba0b22c53fd59ae3c075a95a20b9d063c6984a1f30bbb4efa07abd7f36ee2 +size 1466037 diff --git a/illustrious_generated/2ad78a0f9e2a.png b/illustrious_generated/2ad78a0f9e2a.png new file mode 100644 index 0000000000000000000000000000000000000000..c2f27001556fd79389e87c5cc466ff39eca6847b --- /dev/null +++ b/illustrious_generated/2ad78a0f9e2a.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf46f3c6c9ea11ad8d64baee0c7ae3ea981018854a51aa20e212e4290095106d +size 2215231 diff --git a/illustrious_generated/2ae725c4a0d0.png b/illustrious_generated/2ae725c4a0d0.png new file mode 100644 index 0000000000000000000000000000000000000000..66a5ee0c4af5f128e10ccfdf8bf6ca3b5fe706a9 --- /dev/null +++ b/illustrious_generated/2ae725c4a0d0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4be76749ce1ce075d61617afd104dfd7ebce134149aefd5414d3a2e7745891a0 +size 1121593 diff --git a/illustrious_generated/2afa4f25f333.png b/illustrious_generated/2afa4f25f333.png new file mode 100644 index 0000000000000000000000000000000000000000..cb639082d596ca6ad40974732334d7769e3bbe39 --- /dev/null +++ b/illustrious_generated/2afa4f25f333.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe3d4c104f2e07da34551aee17be5d6bbbaf37de7a66312a6f049ad38f3f311 +size 1868937 diff --git a/illustrious_generated/2b6a55b91a90.png b/illustrious_generated/2b6a55b91a90.png new file mode 100644 index 0000000000000000000000000000000000000000..6181e209b7184aa7c25177bb6e091b856996f77b --- /dev/null +++ b/illustrious_generated/2b6a55b91a90.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3583889f3fa34b8c45207eb37c3ff565afce1875bc9f403f83868a55e64ae4d +size 2017746 diff --git a/illustrious_generated/2c3877e6ad8c.png b/illustrious_generated/2c3877e6ad8c.png new file mode 100644 index 
0000000000000000000000000000000000000000..8cf5618904bb453fcdd3720a52f59678b831f6d5 --- /dev/null +++ b/illustrious_generated/2c3877e6ad8c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d258b4adc80f99f1837d18d2e23ebb3933d9a8a77fd4113919ebea1b4c117514 +size 2573509 diff --git a/illustrious_generated/2f0752fbb34d.png b/illustrious_generated/2f0752fbb34d.png new file mode 100644 index 0000000000000000000000000000000000000000..aa5bd7d21d70c7d7ad57fa5c6423fa1a1557998b --- /dev/null +++ b/illustrious_generated/2f0752fbb34d.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5558efa081195603ababeb878d1d4b71ac5b25bc2a2d79c4db51c814a2da0f19 +size 1123775 diff --git a/illustrious_generated/2f91bb2671e5.png b/illustrious_generated/2f91bb2671e5.png new file mode 100644 index 0000000000000000000000000000000000000000..bcd6a3cb01582454e9b67352493b0e049f2ed91e --- /dev/null +++ b/illustrious_generated/2f91bb2671e5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a7dc7c23c27cb4771edf9bce9399ef1e9f822f9031205dfabb8413235d3a02 +size 560295 diff --git a/illustrious_generated/2fa99de5fb4e.png b/illustrious_generated/2fa99de5fb4e.png new file mode 100644 index 0000000000000000000000000000000000000000..60d48e50e4e05682f374dd722dbc13e96852eb18 --- /dev/null +++ b/illustrious_generated/2fa99de5fb4e.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dc902e42b8569e63f7f0f41bb594404990b0457ca2dda905f4e5bfb95119897 +size 1204167 diff --git a/illustrious_generated/2fe068868319.png b/illustrious_generated/2fe068868319.png new file mode 100644 index 0000000000000000000000000000000000000000..0e904ac47927616781c2080b3b78fd412447f4dc --- /dev/null +++ b/illustrious_generated/2fe068868319.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1366de53ecf71b9e3bd91e486fff2e9f4d1c94811158b4f02db11b477edf2843 +size 488803 diff --git a/illustrious_generated/30359e982f49.png b/illustrious_generated/30359e982f49.png new file mode 100644 index 0000000000000000000000000000000000000000..8a32707b402a253776d5e67f942f741dbd7675ae --- /dev/null +++ b/illustrious_generated/30359e982f49.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b9d84a6b75623dde50dc0fc28b396209928dc56d60b36d24b5c1e432d23e33 +size 2739432 diff --git a/illustrious_generated/3064604370dd.png b/illustrious_generated/3064604370dd.png new file mode 100644 index 0000000000000000000000000000000000000000..00a61c7c3ad5c2e4f85c02e7f5eb5813f6d7844a --- /dev/null +++ b/illustrious_generated/3064604370dd.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b995d74bd4500dec821ace0db5a22ea5fed00e9992fa22b5f25d7ab88683ba33 +size 1104736 diff --git a/illustrious_generated/319935ee0505.png b/illustrious_generated/319935ee0505.png new file mode 100644 index 0000000000000000000000000000000000000000..1f1fdb41bd4f5f6825a272c5a6f2fcc733fb0e76 --- /dev/null +++ b/illustrious_generated/319935ee0505.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb1d1bcb2a25f5eb566c07141d71e7e4ed537ba9091207dfaa3d69b0c4b197b3 +size 823165 diff --git a/illustrious_generated/31d9280e70ec.png b/illustrious_generated/31d9280e70ec.png new file mode 100644 index 0000000000000000000000000000000000000000..07e78b17212d7fea9426948accf3c8e1c6d5ab87 --- /dev/null +++ b/illustrious_generated/31d9280e70ec.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7180f4cd6b112fd1ceafe26460557293e3edaa849e1edd5dec1a21d0ddd5fa26 +size 1459424 diff --git a/illustrious_generated/335a4d8318dc.png b/illustrious_generated/335a4d8318dc.png new file mode 100644 index 0000000000000000000000000000000000000000..51fb8a8250a4421b56d582f4b92dd286830a1206 --- /dev/null +++ b/illustrious_generated/335a4d8318dc.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa68c728c98865e8327086ab2dfa4735b98c59e10b7736fa4fd338bb7616f1e3 +size 3063905 diff --git a/illustrious_generated/337d3617ab33.png b/illustrious_generated/337d3617ab33.png new file mode 100644 index 0000000000000000000000000000000000000000..8c115e02a1bd7446455eceae4b954ebd69b256ea --- /dev/null +++ b/illustrious_generated/337d3617ab33.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a81ec29d93a92dd656d6b9e258a1fc5ffccda8918bbc3a64de2b47f814975d85 +size 829622 diff --git a/illustrious_generated/3575e7061f8b.png b/illustrious_generated/3575e7061f8b.png new file mode 100644 index 0000000000000000000000000000000000000000..5a5a4de23dfb3ecf52643b6850ec73efdd558812 --- /dev/null +++ b/illustrious_generated/3575e7061f8b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17ed84d27d1b14142108c3bd3b10b74e55eb9189ddf049fd2996bab7d806e971 +size 1617159 diff --git a/illustrious_generated/35b1f3a2020f.png b/illustrious_generated/35b1f3a2020f.png new file mode 100644 index 0000000000000000000000000000000000000000..fdbc59f54587049d9594f68590d108bbfbc58e5d --- /dev/null +++ b/illustrious_generated/35b1f3a2020f.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d3cb2bb7da1008ed5b97e55b287d23cbbc75300868f20532966bf2b1c6fca92 +size 1491653 diff --git a/illustrious_generated/3668a4618273.png b/illustrious_generated/3668a4618273.png new file mode 100644 index 0000000000000000000000000000000000000000..f37dab47e7c0eb107990becdd2c20ed3f6e8934f --- /dev/null +++ b/illustrious_generated/3668a4618273.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acd48088d1485eb9c59bfadf5beaa3279a77694462f5804e53f8a62638990e84 +size 2331972 diff --git a/illustrious_generated/3693124afec3.png b/illustrious_generated/3693124afec3.png new file mode 100644 index 0000000000000000000000000000000000000000..a8c5bb80e1020545a96b5b90bbb046be0a1bc9b6 --- /dev/null +++ b/illustrious_generated/3693124afec3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a0123198030208a4609dea6b5028b72b100796822ad8083ae1beace532574d2 +size 1820526 diff --git a/illustrious_generated/37d6b0261a20.png b/illustrious_generated/37d6b0261a20.png new file mode 100644 index 0000000000000000000000000000000000000000..4901c26d53113d2fa8f1f551884861186d9cab3e --- /dev/null +++ b/illustrious_generated/37d6b0261a20.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e5b8a6ead7e48066e27b53f8b53c9a50657498dc0eb7661ba0be2ca55b4f3d3 +size 511966 diff --git a/illustrious_generated/3b956a9b1fa6.png b/illustrious_generated/3b956a9b1fa6.png new file mode 100644 index 0000000000000000000000000000000000000000..f92b7f2fada0349fbbc4d641a4401dbe44257513 --- /dev/null +++ b/illustrious_generated/3b956a9b1fa6.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b863fe16e475000ccf235f491b1ed351d3e2dbb5fa84575ad60fa031cd4b537e +size 1223636 diff --git a/illustrious_generated/3c1537e059f3.png b/illustrious_generated/3c1537e059f3.png new file mode 100644 index 
0000000000000000000000000000000000000000..80e63925e04d7e940fdac397d84a9f2913b7cb7b --- /dev/null +++ b/illustrious_generated/3c1537e059f3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48d93a956c553d8916764ab949467574e70b60b9d7b85266581e7634b4c30017 +size 1023381 diff --git a/illustrious_generated/3da0c01cc34b.png b/illustrious_generated/3da0c01cc34b.png new file mode 100644 index 0000000000000000000000000000000000000000..e0b399115c16d625de238a1cec8ab8249ac2ed05 --- /dev/null +++ b/illustrious_generated/3da0c01cc34b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9da531d3a2c29e3de190c047b9aeea6cc256ecbb618b970b04cf2b1dfb34ec9 +size 2877589 diff --git a/illustrious_generated/40da46f29016.png b/illustrious_generated/40da46f29016.png new file mode 100644 index 0000000000000000000000000000000000000000..b6d87458132bd35a521d122d3d76f97fbd0e358d --- /dev/null +++ b/illustrious_generated/40da46f29016.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d352f4ff4c0d823f8117bd3e90ccd79384240ebd72a5fb540f011927866c76 +size 1953376 diff --git a/illustrious_generated/41f8c017c60b.png b/illustrious_generated/41f8c017c60b.png new file mode 100644 index 0000000000000000000000000000000000000000..db9d7680ff37c534284ff0a4f585bf19dbf5ca38 --- /dev/null +++ b/illustrious_generated/41f8c017c60b.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22fd65a08d4291a70d15116613166ecf7a224e70f772b104d90675fde125018f +size 2655397 diff --git a/illustrious_generated/4360dd6c5fe6.png b/illustrious_generated/4360dd6c5fe6.png new file mode 100644 index 0000000000000000000000000000000000000000..a9ac060547ba2321fcd8956532df7b8988134fd2 --- /dev/null +++ b/illustrious_generated/4360dd6c5fe6.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08ce8e47718ce03ba49937447c713ec343a230f315b00985efd1e758888d1355 +size 533864 diff --git a/illustrious_generated/451a24867701.png b/illustrious_generated/451a24867701.png new file mode 100644 index 0000000000000000000000000000000000000000..1e7adf70fe1605ff97ba981a26ac1483e8c65d2e --- /dev/null +++ b/illustrious_generated/451a24867701.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d3c433819dc892d4e94860d620a1a57c37ea8a5d7cb371f7cb9f91af0b88891 +size 532025 diff --git a/illustrious_generated/472d6f988766.png b/illustrious_generated/472d6f988766.png new file mode 100644 index 0000000000000000000000000000000000000000..01d6d1d3f41257374357640cec96d6fa011a0f42 --- /dev/null +++ b/illustrious_generated/472d6f988766.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f03396d6d8d6c31470d200c37129bfa1a3b75943fb7b6b7ce7e36ea781595d4 +size 2076133 diff --git a/illustrious_generated/4753abc59d67.png b/illustrious_generated/4753abc59d67.png new file mode 100644 index 0000000000000000000000000000000000000000..a669ab48ccf157526b31bf640bd0e81d90eca767 --- /dev/null +++ b/illustrious_generated/4753abc59d67.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e99b7b26344702dadef3335bb82ae32f134a86e37739f42e7b19e71a463726ae +size 599792 diff --git a/illustrious_generated/4969e23914e7.png b/illustrious_generated/4969e23914e7.png new file mode 100644 index 0000000000000000000000000000000000000000..f2815c5f59e3427fbd204df657c931690a1289e0 --- /dev/null +++ b/illustrious_generated/4969e23914e7.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0136b255f30a745ab300881e64b4948510ab6f572bf7a1f2584ad0753129a57b +size 1629482 diff --git a/illustrious_generated/49af3580681e.png b/illustrious_generated/49af3580681e.png new file mode 100644 index 0000000000000000000000000000000000000000..1de80f9235dcb23388d00ebc7e4ffaf41dc24f3a --- /dev/null +++ b/illustrious_generated/49af3580681e.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3531d89e6cdcdc7588f43bd97ecb88422660565fcbd346277fbd3f41378940ba +size 615455 diff --git a/illustrious_generated/4a928e27008c.png b/illustrious_generated/4a928e27008c.png new file mode 100644 index 0000000000000000000000000000000000000000..75f2e81641052de0091c9d02feabdbe854b07559 --- /dev/null +++ b/illustrious_generated/4a928e27008c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86b7d05c168404766a171afece288194f5cd881d3e4c528b81d25a9dcf4452fe +size 1480452 diff --git a/illustrious_generated/4ad3fe906431.png b/illustrious_generated/4ad3fe906431.png new file mode 100644 index 0000000000000000000000000000000000000000..38373357fd0a1d5fe02b7fc42db17301a6fdc58c --- /dev/null +++ b/illustrious_generated/4ad3fe906431.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b284db2703a7b1a8c9ba531c7f9e68051c4a12fe8841575ae6f597543cb617c +size 571931 diff --git a/illustrious_generated/4d20b70e8708.png b/illustrious_generated/4d20b70e8708.png new file mode 100644 index 0000000000000000000000000000000000000000..f0481fc227bdada44a22cdf400967dff131c679a --- /dev/null +++ b/illustrious_generated/4d20b70e8708.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb23d1714ce56834d9e330ab5a00aa0b57a676d15df9a1c131fc998a11fa953 +size 2577980 diff --git a/illustrious_generated/4e97c5bd3369.png b/illustrious_generated/4e97c5bd3369.png new file mode 100644 index 0000000000000000000000000000000000000000..3258cbedae79a30bb721018a9ef6aeb65218cde9 --- /dev/null +++ b/illustrious_generated/4e97c5bd3369.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559935b52bf12d4652003582b268b5b3bdf67997ef92d2a15089ec7ae3c55f7 +size 783712 diff --git a/illustrious_generated/4f890acc5300.png b/illustrious_generated/4f890acc5300.png new file mode 100644 index 0000000000000000000000000000000000000000..0a59e81547bedb9256a7c9335f28dba63a79d479 --- /dev/null +++ b/illustrious_generated/4f890acc5300.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864d95e9983f5767c662850e8940e081e19487a24e87be4b0ed4fe03c072e19a +size 698287 diff --git a/illustrious_generated/5137928cfeec.png b/illustrious_generated/5137928cfeec.png new file mode 100644 index 0000000000000000000000000000000000000000..4571a76bbc65d8542c5f32d0b3432e9fc07fa6e1 --- /dev/null +++ b/illustrious_generated/5137928cfeec.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f2d29451aff0f3463887451507d3e229bafbaba3b2c6fc2e674302386e5a569 +size 1322751 diff --git a/illustrious_generated/51ccc9c08464.png b/illustrious_generated/51ccc9c08464.png new file mode 100644 index 0000000000000000000000000000000000000000..8430aeb577c32144021773de014053a0015959c8 --- /dev/null +++ b/illustrious_generated/51ccc9c08464.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:470e0e26abdbaa49528b6cc0a0126d704807bd763c6002618d4c8ec8d078a877 +size 1114947 diff --git a/illustrious_generated/52805af09a5d.png b/illustrious_generated/52805af09a5d.png new file mode 100644 index 
0000000000000000000000000000000000000000..f6c35dcc4d485e95e3d8f0e8c5656d9e95c019e7
--- /dev/null
+++ b/illustrious_generated/52805af09a5d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b97f726452e6cf6df8905ada4bac8b662f6f7945adbcc988214299443a1e86c9
+size 1280450
diff --git a/illustrious_generated/538320539396.png b/illustrious_generated/538320539396.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce6cbcd206a462e845691d2318845f50fda5471e
--- /dev/null
+++ b/illustrious_generated/538320539396.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b95f410d42b01b6740e1b9c64bb937098847452e592c2394a35a93b2ba50b41
+size 4256985
diff --git a/illustrious_generated/5566f4e6dbb8.png b/illustrious_generated/5566f4e6dbb8.png
new file mode 100644
index 0000000000000000000000000000000000000000..7760fa605541df0a869801aea3264ebe1079bb0c
--- /dev/null
+++ b/illustrious_generated/5566f4e6dbb8.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91ca761fdc0aa6e2476a3d695f8be623d4d1aaa61637bb0f0ee7e4f7c3d4ae7d
+size 1199597
diff --git a/illustrious_generated/560118678ad5.png b/illustrious_generated/560118678ad5.png
new file mode 100644
index 0000000000000000000000000000000000000000..275d99812b43628c1cb7ac4ed1c14a9e775b844c
--- /dev/null
+++ b/illustrious_generated/560118678ad5.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca3b93b3405d194a5c62ca9e6b759b214ae5fc026f22dd0e442a1315ea0124c7
+size 1706262
diff --git a/illustrious_generated/560933340e7c.png b/illustrious_generated/560933340e7c.png
new file mode 100644
index 0000000000000000000000000000000000000000..62e27dc8c2059773171916ef6e20c8e22b8a1de7
--- /dev/null
+++ b/illustrious_generated/560933340e7c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51024c1a7a99666ad2b3cf76e1bf7566030453722a50265a0f5c4e3b48f2f78c
+size 1733745
diff --git a/illustrious_generated/56da4fd854cf.png b/illustrious_generated/56da4fd854cf.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc9cf5240f5a34ca8029cbb5658ea64aaa318802
--- /dev/null
+++ b/illustrious_generated/56da4fd854cf.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f1db20cf35a2febe1e7fc9442c7ce5adf47a48ac2d7cf4a16fdc7630fdfbaa1
+size 2638451
diff --git a/illustrious_generated/5730af2fb53b.png b/illustrious_generated/5730af2fb53b.png
new file mode 100644
index 0000000000000000000000000000000000000000..66dbbaf7e77a81de8495f4f64823aff247f75d78
--- /dev/null
+++ b/illustrious_generated/5730af2fb53b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58e946fda8aa9d21cf470661df20e09780957bde8b85b61c9ff0ec4405b35e8f
+size 2396616
diff --git a/illustrious_generated/576c452a2186.png b/illustrious_generated/576c452a2186.png
new file mode 100644
index 0000000000000000000000000000000000000000..8a7e6a81c53394b183649f61bce4a2b109679dad
--- /dev/null
+++ b/illustrious_generated/576c452a2186.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:475b6ffcfc39857d41dac5b84e511425a627d29aceac56fdda7d199289f87f2a
+size 5209781
diff --git a/illustrious_generated/59499e6ab909.png b/illustrious_generated/59499e6ab909.png
new file mode 100644
index 0000000000000000000000000000000000000000..657f5d7426fc39df3aba7cbffa4057bc57e2b833
--- /dev/null
+++ b/illustrious_generated/59499e6ab909.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f7afe8e887ed9c004e78fc144283dfc31cb028b3e2ed673259e23e7f7f526fb
+size 4118507
diff --git a/illustrious_generated/59789c1bd20c.png b/illustrious_generated/59789c1bd20c.png
new file mode 100644
index 0000000000000000000000000000000000000000..1325f2b5691a72c5fef1e05d49423a1d80b45abc
--- /dev/null
+++ b/illustrious_generated/59789c1bd20c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:760c4ebf8ee1e6859277c5037a4d26c8dd1c89bebd08fd3537f2c94402a137f8
+size 3768619
diff --git a/illustrious_generated/59c147e61072.png b/illustrious_generated/59c147e61072.png
new file mode 100644
index 0000000000000000000000000000000000000000..b47e8a9667747077d06aaebf1e640f06693fdb1f
--- /dev/null
+++ b/illustrious_generated/59c147e61072.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57553611843817fc8ddda570da6fc70ad67b059df27368aa37bb4da92629c574
+size 654827
diff --git a/illustrious_generated/59cdbdf0892c.png b/illustrious_generated/59cdbdf0892c.png
new file mode 100644
index 0000000000000000000000000000000000000000..de68f37b952422ad82f08c751be24ece982a32e5
--- /dev/null
+++ b/illustrious_generated/59cdbdf0892c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7e8ce1e98cfe444a130f6a33cde4cc9ed9f7b77ff940492ff0ccd61090ebe2f
+size 1425365
diff --git a/illustrious_generated/5b858ea629d3.png b/illustrious_generated/5b858ea629d3.png
new file mode 100644
index 0000000000000000000000000000000000000000..97758eb1d33f94f557b4a0a79aa43917057b626a
--- /dev/null
+++ b/illustrious_generated/5b858ea629d3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84e9c9b8b6f560b482394978c375017b225e2f6717cb985d034a780914f32a9b
+size 4192436
diff --git a/illustrious_generated/5b95a8c1c1cc.png b/illustrious_generated/5b95a8c1c1cc.png
new file mode 100644
index 0000000000000000000000000000000000000000..1279b9e3541a64155aeda7c3f1b3fd9c59c66a56
--- /dev/null
+++ b/illustrious_generated/5b95a8c1c1cc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d08d1f27e26850e673d90c617a9d807cb410242f0e46d231e6fce67bd0ac6fc
+size 352388
diff --git a/illustrious_generated/5bbfed62d132.png b/illustrious_generated/5bbfed62d132.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc12c026b6f6b33e48ab691628a2d86794a9e175
--- /dev/null
+++ b/illustrious_generated/5bbfed62d132.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:233fe734a2b6bc1ab1312e746efdfa3217a44b9c85e1e70e8a08917bb32e73c9
+size 1181694
diff --git a/illustrious_generated/5d114a44299d.png b/illustrious_generated/5d114a44299d.png
new file mode 100644
index 0000000000000000000000000000000000000000..da8e53aed92e9728c6bdadb9f9e597f838aa4028
--- /dev/null
+++ b/illustrious_generated/5d114a44299d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a13a48ddcc4904da98789a736d861b5a30e5074e6236339adfab1555d958d277
+size 502053
diff --git a/illustrious_generated/5d20b65efa2c.png b/illustrious_generated/5d20b65efa2c.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4c362f56423033628c88f614cbb33176e8a4968
--- /dev/null
+++ b/illustrious_generated/5d20b65efa2c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b43ee179ffcb2ea903441dbe853d46a31e58d9d6b0815e122f5bc37ed94220e9
+size 712207
diff --git a/illustrious_generated/5df8f6f96448.png b/illustrious_generated/5df8f6f96448.png
new file mode 100644
index 0000000000000000000000000000000000000000..22a91c97dcade047f433b2a04f732a515f2501b7
--- /dev/null
+++ b/illustrious_generated/5df8f6f96448.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6a95c8b2846c03926ef35a851ce99b2f96e0c7c38805f54024216acd50fe51b
+size 1485260
diff --git a/illustrious_generated/5e8ee0566361.png b/illustrious_generated/5e8ee0566361.png
new file mode 100644
index 0000000000000000000000000000000000000000..01e65e549304d2b9552ddb9facfcd809a0ece953
--- /dev/null
+++ b/illustrious_generated/5e8ee0566361.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38e30af864d263b01a61b1c4344835ce0dd62e57cea6e9a0a1a04f4a6918fe9d
+size 1924713
diff --git a/illustrious_generated/5f11f3eb1c1c.png b/illustrious_generated/5f11f3eb1c1c.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ce7aba5142926017029ef5b7519e7bf169ef195
--- /dev/null
+++ b/illustrious_generated/5f11f3eb1c1c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d0fd87ccc1d4ca9224da80a1fbd87939140c88c2369269621fcee0e15707ad1
+size 364213
diff --git a/illustrious_generated/612a66962266.png b/illustrious_generated/612a66962266.png
new file mode 100644
index 0000000000000000000000000000000000000000..51307918a7ee7a1527cabeafee6f162089fc7fe7
--- /dev/null
+++ b/illustrious_generated/612a66962266.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:591ffc99c217468a34356c002f0362b45ee792e3e879f204ff2d015263c74f78
+size 2221382
diff --git a/illustrious_generated/6169e58592f5.png b/illustrious_generated/6169e58592f5.png
new file mode 100644
index 0000000000000000000000000000000000000000..eeaff3d5c97ad6528832944b5cb2cf72f5d849d6
--- /dev/null
+++ b/illustrious_generated/6169e58592f5.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5c87de41991c6de7bdc77b888f29298af113dcb3c5c007f1336341afa085e21
+size 2293037
diff --git a/illustrious_generated/6371d5db8285.png b/illustrious_generated/6371d5db8285.png
new file mode 100644
index 0000000000000000000000000000000000000000..13165f1ef503e67f8e616dbf5a32725716c8feb5
--- /dev/null
+++ b/illustrious_generated/6371d5db8285.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dea32a916eeeb28ba962f02629b232b850a03145703f00c2d4104c63abe80c1
+size 900133
diff --git a/illustrious_generated/66833ca0b42b.png b/illustrious_generated/66833ca0b42b.png
new file mode 100644
index 0000000000000000000000000000000000000000..16c1e398780df72e51bfe8afc497f7a493346578
--- /dev/null
+++ b/illustrious_generated/66833ca0b42b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd6b42c7ff4a0be9b9888d0a2250ae8bedea206cfa78c05e9005891781b2e80c
+size 2615138
diff --git a/illustrious_generated/676918d623cc.png b/illustrious_generated/676918d623cc.png
new file mode 100644
index 0000000000000000000000000000000000000000..f2dabf0d38851ce5544a8dbb80f1facec869324b
--- /dev/null
+++ b/illustrious_generated/676918d623cc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4365ce9a284a0b723db53be97cb5efbd1efc2e0934a22b582d558126bafc0cf3
+size 2854638
diff --git a/illustrious_generated/67c0c434f1a3.png b/illustrious_generated/67c0c434f1a3.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd7e9d1832ff7a9284524d443d23ce7880de857e
--- /dev/null
+++ b/illustrious_generated/67c0c434f1a3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69cd38711681a86a67cbcfc6dc34ca094ad9290dadd313041f2543455a3806c5
+size 978153
diff --git a/illustrious_generated/682bbc2c61b4.png b/illustrious_generated/682bbc2c61b4.png
new file mode 100644
index 0000000000000000000000000000000000000000..183d04d71f9d34bbc55553cf4f63dc1841b3a7a4
--- /dev/null
+++ b/illustrious_generated/682bbc2c61b4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e45dc32c53fd6428b388bc491df7a0a2c5e974053590f9e51972a75795b75cb
+size 563863
diff --git a/illustrious_generated/69b08a21655a.png b/illustrious_generated/69b08a21655a.png
new file mode 100644
index 0000000000000000000000000000000000000000..fe31ef88bbcfd8121b6cba48e820c20339dabeb3
--- /dev/null
+++ b/illustrious_generated/69b08a21655a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da301be5abbbb96e600682973fe4741a14bf85f8ee30368791e9fce0ba0e11bc
+size 932497
diff --git a/illustrious_generated/69df76e79d3a.png b/illustrious_generated/69df76e79d3a.png
new file mode 100644
index 0000000000000000000000000000000000000000..d25b712ea0222485ba219f93a23bbf0311206bbe
--- /dev/null
+++ b/illustrious_generated/69df76e79d3a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a00541631df0cd0c880678237fe09a00eb72cc05971216f29db21d5e668ab178
+size 611014
diff --git a/illustrious_generated/6a30a8a30272.png b/illustrious_generated/6a30a8a30272.png
new file mode 100644
index 0000000000000000000000000000000000000000..08e4abbc8fdec1920f01b3a449220c690604bae3
--- /dev/null
+++ b/illustrious_generated/6a30a8a30272.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bc651f2ceca8260645971314e9bb2559d7a8f2b2a66f73b7b6b0d14024574d9
+size 2122280
diff --git a/illustrious_generated/6c4d377e3b51.png b/illustrious_generated/6c4d377e3b51.png
new file mode 100644
index 0000000000000000000000000000000000000000..323aa6b4373c5e60479ba402e07268f05ab1597f
--- /dev/null
+++ b/illustrious_generated/6c4d377e3b51.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a748183e298936fdde5358e9a1dc5b845395e9a00b383f8b4a9d8f859eea0b2
+size 2519554
diff --git a/illustrious_generated/6c80e465d221.png b/illustrious_generated/6c80e465d221.png
new file mode 100644
index 0000000000000000000000000000000000000000..4d3995827b914e0ec9840ed3b9ef31d7702e21dc
--- /dev/null
+++ b/illustrious_generated/6c80e465d221.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:75c8ca474b50d0b50cea87963c191b493cb92eb5109af7290ac50931728ab44a
+size 2424732
diff --git a/illustrious_generated/6d0dffee930e.png b/illustrious_generated/6d0dffee930e.png
new file mode 100644
index 0000000000000000000000000000000000000000..eceb45579be6630d8c593bc1ac0145a1236b2c6b
--- /dev/null
+++ b/illustrious_generated/6d0dffee930e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6eb4582ddebd0c4e7b97ed49c2c1e0139bac8fc8c3a5cbc9a0d57e20a5f5ed4d
+size 2369513
diff --git a/illustrious_generated/6dc3596347dd.png b/illustrious_generated/6dc3596347dd.png
new file mode 100644
index 0000000000000000000000000000000000000000..4bc5a141a02854db0a8907aa77365fd6515ceea9
--- /dev/null
+++ b/illustrious_generated/6dc3596347dd.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f58f61eb4f11cf8a7615f24afb6f4c4fbb17ac0e39ab38583abff5d175803a81
+size 543206
diff --git a/illustrious_generated/6f3c00c7fe54.png b/illustrious_generated/6f3c00c7fe54.png
new file mode 100644
index 0000000000000000000000000000000000000000..c55f960b7401d4f27047cb494f6926613a859d26
--- /dev/null
+++ b/illustrious_generated/6f3c00c7fe54.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:143a7b24e9f81311ff0ee0ecce7c6756ca0026b7a1f7414379261c6b4e358bb6
+size 799232
diff --git a/illustrious_generated/6ff9f8e3bc30.png b/illustrious_generated/6ff9f8e3bc30.png
new file mode 100644
index 0000000000000000000000000000000000000000..75f26b6bd8dc7a1c772abcff1996aa8aaabe1a62
--- /dev/null
+++ b/illustrious_generated/6ff9f8e3bc30.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3fdd91a6dc2e10aff277439074b606519988ae461fb55d1f1a66bedede46782
+size 524233
diff --git a/illustrious_generated/704c754e9c7c.png b/illustrious_generated/704c754e9c7c.png
new file mode 100644
index 0000000000000000000000000000000000000000..eb74c7b2943630d1753fe60bf0d078c00de671f7
--- /dev/null
+++ b/illustrious_generated/704c754e9c7c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:332abf9b79b5b4b99428aaf21f9f946bed0cea3183176c6a05d2054d2377d1be
+size 4847577
diff --git a/illustrious_generated/7095bb63dced.png b/illustrious_generated/7095bb63dced.png
new file mode 100644
index 0000000000000000000000000000000000000000..d75d96cdf0345c5da1cdd2aa6c3016e91ed09255
--- /dev/null
+++ b/illustrious_generated/7095bb63dced.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c4bb050e9044127fd7fd67f69da0440223022ebf4211323c8e364c92a3e7077
+size 1059962
diff --git a/illustrious_generated/70f4e182ad99.png b/illustrious_generated/70f4e182ad99.png
new file mode 100644
index 0000000000000000000000000000000000000000..9e634d798b118123655cdb4d70cf9cf7b4412b5b
--- /dev/null
+++ b/illustrious_generated/70f4e182ad99.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a21f6d7f23df96dffb648e9f0999b0dd07bac4da5ca0b6bca382513a14425eec
+size 990030
diff --git a/illustrious_generated/7494b0a6a33c.png b/illustrious_generated/7494b0a6a33c.png
new file mode 100644
index 0000000000000000000000000000000000000000..fbeb4315f5f100853b47fdbcb47652e6cdae03e0
--- /dev/null
+++ b/illustrious_generated/7494b0a6a33c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3406a7b316af7b0172b27d17db0d0671b73d25604b55ef5a93b5b80d074bc855
+size 1783488
diff --git a/illustrious_generated/75aae2a95a1c.png b/illustrious_generated/75aae2a95a1c.png
new file mode 100644
index 0000000000000000000000000000000000000000..48127f5939b66960793dd346907653fd35aea675
--- /dev/null
+++ b/illustrious_generated/75aae2a95a1c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bdca5ad9abbfde5a6ba857b3b4da18a2e37928ab0c8cb9752a77247a2b357dc
+size 810669
diff --git a/illustrious_generated/76ab88233a8a.png b/illustrious_generated/76ab88233a8a.png
new file mode 100644
index 0000000000000000000000000000000000000000..2a7b23fed63e8d6db704b132d805226b1a7134f6
--- /dev/null
+++ b/illustrious_generated/76ab88233a8a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51e64f2f7f936878e26c831d8ff5d8fca1537d663c5544abcf4ccf4242fd813f
+size 4849309
diff --git a/illustrious_generated/7779dd857b31.png b/illustrious_generated/7779dd857b31.png
new file mode 100644
index 0000000000000000000000000000000000000000..10cf9e81b580ceae59b7e650b1327d541b3d1019
--- /dev/null
+++ b/illustrious_generated/7779dd857b31.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03ac7991b25c173d7b6f932046435c0c22cfd0bba5cffb95826e2d9187edfe10
+size 930957
diff --git a/illustrious_generated/7865e4e37a96.png b/illustrious_generated/7865e4e37a96.png
new file mode 100644
index 0000000000000000000000000000000000000000..9f6da41b75735d1c320f92f2a6abb73e42b06d23
--- /dev/null
+++ b/illustrious_generated/7865e4e37a96.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d94565c3370c358351dbd0a5825c75eb0cd109eb4e756d0b0560fd33c924f6d4
+size 934645
diff --git a/illustrious_generated/78718ff36e2c.png b/illustrious_generated/78718ff36e2c.png
new file mode 100644
index 0000000000000000000000000000000000000000..bcd6d9cf44301147e8795ab0329e4168e37c7a1f
--- /dev/null
+++ b/illustrious_generated/78718ff36e2c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5bbe46beb6902746ee852a907405285ebe75e9a0d74db007d6f409fa7290b47d
+size 1008073
diff --git a/illustrious_generated/790a13ed888e.png b/illustrious_generated/790a13ed888e.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d6c1775229c9eb9d01edabef3799c48f9860dc7
--- /dev/null
+++ b/illustrious_generated/790a13ed888e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53116969771b63ec90a4bc8a72673960ec389a4af0ddd2120185adc2049b3fd4
+size 890996
diff --git a/illustrious_generated/7926406498df.png b/illustrious_generated/7926406498df.png
new file mode 100644
index 0000000000000000000000000000000000000000..fff6a8cdc232e0614e709f90218feb61df54c43a
--- /dev/null
+++ b/illustrious_generated/7926406498df.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b04d0ccfb51b87a02f9f7db4cb10549c72d124f2051ef0dfa66d24f659adc4c
+size 1370584
diff --git a/illustrious_generated/7a5292306239.png b/illustrious_generated/7a5292306239.png
new file mode 100644
index 0000000000000000000000000000000000000000..e9b06f2a1e856c84bdbeac43c6af84595760a69d
--- /dev/null
+++ b/illustrious_generated/7a5292306239.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e36145da885c1b0d513d853ef5be855dac1a3fa1979a8cb7dacdd1f89337db12
+size 2023998
diff --git a/illustrious_generated/7a8dee7fe0e5.png b/illustrious_generated/7a8dee7fe0e5.png
new file mode 100644
index 0000000000000000000000000000000000000000..faeb2d5891a5ad37105f25d1753baafbd2663fcf
--- /dev/null
+++ b/illustrious_generated/7a8dee7fe0e5.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2af457be0a073d736b0738385c7a989a86b7fe95128be8600c5f059e8dcccb3b
+size 2469180
diff --git a/illustrious_generated/7ac893769529.png b/illustrious_generated/7ac893769529.png
new file mode 100644
index 0000000000000000000000000000000000000000..1c4bec9f36425394df2814ed661fde4c40598cc4
--- /dev/null
+++ b/illustrious_generated/7ac893769529.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d01e8f93e32d887941b16cdbd4947d47bfb91318849dde9fc4dfb35bac448a36
+size 1691748
diff --git a/illustrious_generated/7ea16faf601e.png b/illustrious_generated/7ea16faf601e.png
new file mode 100644
index 0000000000000000000000000000000000000000..9172d3057f8070b1c4f5b332f43a3dd66f35d92d
--- /dev/null
+++ b/illustrious_generated/7ea16faf601e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:812b4e9b277d966951ec235f9e120a7de597cf97e20a73fa313a1cfd5feeff59
+size 2546195
diff --git a/illustrious_generated/808edae3dfdb.png b/illustrious_generated/808edae3dfdb.png
new file mode 100644
index 0000000000000000000000000000000000000000..5e7c2179948b0ac797b2104fa0a430677528f3ea
--- /dev/null
+++ b/illustrious_generated/808edae3dfdb.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80d4f32df392c6f31db307be1e7e91466baa3920600f567281bf04049e5fcebd
+size 1438884
diff --git a/illustrious_generated/80c6d995aefc.png b/illustrious_generated/80c6d995aefc.png
new file mode 100644
index 0000000000000000000000000000000000000000..c2f693459618e896cccc1a385e1ed3a6e14a3816
--- /dev/null
+++ b/illustrious_generated/80c6d995aefc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d355eff5dd86096b325918dd6ef5d02104e8e40ad0e2ad51e99d96fc10320671
+size 2617683
diff --git a/illustrious_generated/81b59f537d5b.png b/illustrious_generated/81b59f537d5b.png
new file mode 100644
index 0000000000000000000000000000000000000000..b539c6aca5a7613aa06447eac9680fa11dd298a9
--- /dev/null
+++ b/illustrious_generated/81b59f537d5b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f813ed979b5ace401ea87bfe75e6443611c1bf7046184136ef9035517ba99824
+size 1966965
diff --git a/illustrious_generated/8315f76b269d.png b/illustrious_generated/8315f76b269d.png
new file mode 100644
index 0000000000000000000000000000000000000000..ed3d14e3dae88e7defe0e4b33d0e50efc9536998
--- /dev/null
+++ b/illustrious_generated/8315f76b269d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7bf446cb0c13d256b70597dac6f32511850d85c1a681ff00ecb8bd0479c7d26
+size 1121733
diff --git a/illustrious_generated/83845ff7fe93.png b/illustrious_generated/83845ff7fe93.png
new file mode 100644
index 0000000000000000000000000000000000000000..01fbcc5d5fd6a54f6842e2f3104dcc13b6c9f734
--- /dev/null
+++ b/illustrious_generated/83845ff7fe93.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c567a6b6df52a3e6ae906bcc64595aa023ba0793d1ddae5a3289b23752192073
+size 1413347
diff --git a/illustrious_generated/83bc95efaad6.png b/illustrious_generated/83bc95efaad6.png
new file mode 100644
index 0000000000000000000000000000000000000000..376b34d357182223b3de16eb90e4774a21ee1d89
--- /dev/null
+++ b/illustrious_generated/83bc95efaad6.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f24c1b0136144626d0f3d2427f6247a8c85647b4d174ec1adba56b9566004c9
+size 1096315
diff --git a/illustrious_generated/84ead255f716.png b/illustrious_generated/84ead255f716.png
new file mode 100644
index 0000000000000000000000000000000000000000..b12a8a50c053bb2f4d851d699cc5f525ffc0433c
--- /dev/null
+++ b/illustrious_generated/84ead255f716.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2d50f33aecc1e92a477f70fb353a9a7b325f8f98eba24a1eb6d50214a70836d
+size 1272028
diff --git a/illustrious_generated/852cf82eab3b.png b/illustrious_generated/852cf82eab3b.png
new file mode 100644
index 0000000000000000000000000000000000000000..db45aa159eaf04f0ef90805e4e03c9b3e29b9629
--- /dev/null
+++ b/illustrious_generated/852cf82eab3b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:906273e37be24fb55acaa43b7fd6e69cb7bf6ee46696376ff482a415d8bb2fe5
+size 1965118
diff --git a/illustrious_generated/853f4976e742.png b/illustrious_generated/853f4976e742.png
new file mode 100644
index 0000000000000000000000000000000000000000..cf28ff7e4a42ce14db9267b72712257febb41791
--- /dev/null
+++ b/illustrious_generated/853f4976e742.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a542e49d16dabb0d6eafa851ac022e802de004dd331e3b55b9808250528cfb9a
+size 3016029
diff --git a/illustrious_generated/858fafff6c75.png b/illustrious_generated/858fafff6c75.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e1ec7912f8e6c04cff7e52e5e80b7acdd070d08
--- /dev/null
+++ b/illustrious_generated/858fafff6c75.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d90c5ba71730c55da6202e7c1ae588526dfcd55a46ce26b88a7c990c32436032
+size 1248100
diff --git a/illustrious_generated/861efde50ba4.png b/illustrious_generated/861efde50ba4.png
new file mode 100644
index 0000000000000000000000000000000000000000..a782fb2a8f132ffa496bf2a520e8b4a4c444640c
--- /dev/null
+++ b/illustrious_generated/861efde50ba4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fb9503aaf02f044d079829ddf07f83170f57c63d9ae3321053809e02fd00743
+size 4691790
diff --git a/illustrious_generated/87b516fc3620.png b/illustrious_generated/87b516fc3620.png
new file mode 100644
index 0000000000000000000000000000000000000000..154b2755d513cc7dedb3b7951a3a485e3dadec22
--- /dev/null
+++ b/illustrious_generated/87b516fc3620.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b9d1f7dd86d46a0bfb62adb3481d91a6ed56d0c7a773ce86db6f645e469893a
+size 1537914
diff --git a/illustrious_generated/8830712aa637.png b/illustrious_generated/8830712aa637.png
new file mode 100644
index 0000000000000000000000000000000000000000..1839ce8f7e33678500b2db153a0488a3a0ee15c0
--- /dev/null
+++ b/illustrious_generated/8830712aa637.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09902a62fc7ecf7fbce43f0a6c77bcc9afab422ca51bf4c8431c0bc07096b2c4
+size 752458
diff --git a/illustrious_generated/8afdb58cdbe3.png b/illustrious_generated/8afdb58cdbe3.png
new file mode 100644
index 0000000000000000000000000000000000000000..5cd0e1fb362e9f3dc75c98ebf2c57825bb47ade8
--- /dev/null
+++ b/illustrious_generated/8afdb58cdbe3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d265f5ce5471e08281ce33c50a7bfb36039cd2316cb4b76758e9590efefde6b7
+size 3690380
diff --git a/illustrious_generated/8bca1099ef12.png b/illustrious_generated/8bca1099ef12.png
new file mode 100644
index 0000000000000000000000000000000000000000..33e9e942db91ec9b1f3fd24185613613025a07e0
--- /dev/null
+++ b/illustrious_generated/8bca1099ef12.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e3d8b3fbfa283247abfffcccb5b52b6f0fd62c5e07dd26fb67890c35a36a961
+size 3419557
diff --git a/illustrious_generated/8f24cf790818.png b/illustrious_generated/8f24cf790818.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff084917116eda36839f177b3575fb8c656a6b74
--- /dev/null
+++ b/illustrious_generated/8f24cf790818.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c4d207666e4f61a0379f043747af3a7ae224644d11d3e6143436b823cac4491
+size 4372413
diff --git a/illustrious_generated/8f61e6e0ecbb.png b/illustrious_generated/8f61e6e0ecbb.png
new file mode 100644
index 0000000000000000000000000000000000000000..f02dbcd9837aeeb108d1082422b9ff12da7f509d
--- /dev/null
+++ b/illustrious_generated/8f61e6e0ecbb.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dadad6a45586e20304d4add778231c4168bbd89ebbeda23baf0f30d803bd0be0
+size 482272
diff --git a/illustrious_generated/901637e9c9cb.png b/illustrious_generated/901637e9c9cb.png
new file mode 100644
index 0000000000000000000000000000000000000000..8aa4b0d1b4dcb250c8e68a5902d1b547f59c14a9
--- /dev/null
+++ b/illustrious_generated/901637e9c9cb.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3aa8a8382ac30d0775f79e83633c981f17abe637d268317ec3184d1fbbd9059
+size 443510
diff --git a/illustrious_generated/92863f01c2a2.png b/illustrious_generated/92863f01c2a2.png
new file mode 100644
index 0000000000000000000000000000000000000000..9e78b39b05d85c462aac1e43e34004e26950ff3e
--- /dev/null
+++ b/illustrious_generated/92863f01c2a2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ecdbc9f20b612d3f287b2827d9ea29be40558382a873525a826338b5d746ca0b
+size 811271
diff --git a/illustrious_generated/93373c8aafcb.png b/illustrious_generated/93373c8aafcb.png
new file mode 100644
index 0000000000000000000000000000000000000000..0aae80378d284ff4ff1d5e825f1a7f7d8b9fd22c
--- /dev/null
+++ b/illustrious_generated/93373c8aafcb.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c63c22dfc6ef7c01acbb2458dc224dc1286ca5105e0601f25a5831c64bca8a73
+size 1275546
diff --git a/illustrious_generated/93c913311968.png b/illustrious_generated/93c913311968.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca82c3dc92f758a6955824ee9f86aa77856da37b
--- /dev/null
+++ b/illustrious_generated/93c913311968.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bc691e1d3b4b13363c176b4d1190a4cad176645b14099631fdc18fbd1892cef
+size 1016982
diff --git a/illustrious_generated/94d5225e009d.png b/illustrious_generated/94d5225e009d.png
new file mode 100644
index 0000000000000000000000000000000000000000..fedb0c6f791fc0b2424378110f7f245bc026c1d5
--- /dev/null
+++ b/illustrious_generated/94d5225e009d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4abfaab3e5b6b1b3391b043fbda80f848f6e0ea6fc4d84562b1db2a941b248d8
+size 2954576
diff --git a/illustrious_generated/97117446b9bc.png b/illustrious_generated/97117446b9bc.png
new file mode 100644
index 0000000000000000000000000000000000000000..025fe57d76ae45ebf5496f42d6705675357c8efe
--- /dev/null
+++ b/illustrious_generated/97117446b9bc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82fb7e2b06b1111fc7eb99cbfdabb596ea125f1431ac8328cf9c8a654a43f455
+size 1643211
diff --git a/illustrious_generated/98cf9e81c386.png b/illustrious_generated/98cf9e81c386.png
new file mode 100644
index 0000000000000000000000000000000000000000..3f5cc878ff892b5923962733ed74c0d90f5c1849
--- /dev/null
+++ b/illustrious_generated/98cf9e81c386.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5d7641e95f799fb6d86121b1b7064352ab7f50c09357a7e2b55e022f1371044
+size 3274049
diff --git a/illustrious_generated/98d24abcd7dc.png b/illustrious_generated/98d24abcd7dc.png
new file mode 100644
index 0000000000000000000000000000000000000000..793f6bb5e8888a0867dea115e57c21f16dc9c84c
--- /dev/null
+++ b/illustrious_generated/98d24abcd7dc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43c33f68e169b774b3085d982430600cfaec13eb84b99ada87643cc1bb04b404
+size 1766134
diff --git a/illustrious_generated/9917f32d22ba.png b/illustrious_generated/9917f32d22ba.png
new file mode 100644
index 0000000000000000000000000000000000000000..06b4f142953da800c18db71f0002b23590aa740a
--- /dev/null
+++ b/illustrious_generated/9917f32d22ba.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24903b568cd4703cff56d365b01a1f24dcead2089dc839f0c6ecb81573bcd294
+size 714031
diff --git a/illustrious_generated/9adfacdf6a05.png b/illustrious_generated/9adfacdf6a05.png
new file mode 100644
index 0000000000000000000000000000000000000000..a031dc64fa353e81f8a139e21472341144ab72f6
--- /dev/null
+++ b/illustrious_generated/9adfacdf6a05.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a22b6a3cf7d175ff2b85efcc46a01e5ba6fd0a75bbad8c5dde16ac6967244285
+size 3492862
diff --git a/illustrious_generated/9c961fbe1f6c.png b/illustrious_generated/9c961fbe1f6c.png
new file mode 100644
index 0000000000000000000000000000000000000000..0df97e4e885e387ea8609e3fd617da8b1be12e15
--- /dev/null
+++ b/illustrious_generated/9c961fbe1f6c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50cc2cd5929003efacff497d9f104089f56877370cbeacafb3c665aaa3e479e1
+size 904520
diff --git a/illustrious_generated/9ef5c666dee6.png b/illustrious_generated/9ef5c666dee6.png
new file mode 100644
index 0000000000000000000000000000000000000000..382790a4035268ac62b4bb0d4f55bda4b2a12485
--- /dev/null
+++ b/illustrious_generated/9ef5c666dee6.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d4fe23804d1d63141997d60947d84806e9b191f6952172526d44356db209b9d
+size 618321
diff --git a/illustrious_generated/a0629ea1fa16.png b/illustrious_generated/a0629ea1fa16.png
new file mode 100644
index 0000000000000000000000000000000000000000..139c2f3ecff27e1e8af9ba033ce24be633c29f93
--- /dev/null
+++ b/illustrious_generated/a0629ea1fa16.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e97532db5731199363403fe57ad08c0a3a93f0381197e21094a2ab2357d5a8a7
+size 2214943
diff --git a/illustrious_generated/a3730f6a349c.png b/illustrious_generated/a3730f6a349c.png
new file mode 100644
index 0000000000000000000000000000000000000000..10ba5e04657fd7fb8813a4b4f3b39d4eaea3699e
--- /dev/null
+++ b/illustrious_generated/a3730f6a349c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4a0233b252af623d561745bec29b20c8626a3ae8133f22bbd6a19fff051a9e0
+size 598428
diff --git a/illustrious_generated/a45e381ff37c.png b/illustrious_generated/a45e381ff37c.png
new file mode 100644
index 0000000000000000000000000000000000000000..8a85e14d65e849891fefe885e153841d84b4e8cd
--- /dev/null
+++ b/illustrious_generated/a45e381ff37c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:250a35bc8186bf989af53c9f661ab0eb09ddcf4a8c3dfe3f74b21600c18c70a6
+size 3321314
diff --git a/illustrious_generated/a55256636f4a.png b/illustrious_generated/a55256636f4a.png
new file mode 100644
index 0000000000000000000000000000000000000000..d1c44ff85fd2beb43778bb05aedf0119c00084ce
--- /dev/null
+++ b/illustrious_generated/a55256636f4a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3113c4ad71d3d3f54105497351fd99aeee4b966503223ec7617127eb50f5740
+size 5309258
diff --git a/illustrious_generated/a599040af025.png b/illustrious_generated/a599040af025.png
new file mode 100644
index 0000000000000000000000000000000000000000..8456528ae0dc0468d147366066a1de1956f419f9
--- /dev/null
+++ b/illustrious_generated/a599040af025.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b961de0d504071c55c34ccc0d3cf1955600ceea3f7f13f86289227a02ce7418
+size 4743579
diff --git a/illustrious_generated/a6a03395863a.png b/illustrious_generated/a6a03395863a.png
new file mode 100644
index 0000000000000000000000000000000000000000..7bbf2a818f2855d299237658b35e97d1bbe1811e
--- /dev/null
+++ b/illustrious_generated/a6a03395863a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5da4d4cdb78f963bed65112bbf9048cb90aefcb50831684e0605321db5275f80
+size 554266
diff --git a/illustrious_generated/a6bb9c91d34f.png b/illustrious_generated/a6bb9c91d34f.png
new file mode 100644
index 0000000000000000000000000000000000000000..877994993f6529070f58992203476f224a08ad0e
--- /dev/null
+++ b/illustrious_generated/a6bb9c91d34f.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a33e82ed24d99207042533e9b6b7973b332e251b452665b87365e60b9cbbce2c
+size 1738820
diff --git a/illustrious_generated/a96f15690715.png b/illustrious_generated/a96f15690715.png
new file mode 100644
index 0000000000000000000000000000000000000000..1800a5f34749c30f40c36692c3621e5e0931c638
--- /dev/null
+++ b/illustrious_generated/a96f15690715.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41d3c11f9545c6049ae96be8a65a826d2d193e9b41cd29d851b9cb8f08d5cae9
+size 2037202
diff --git a/illustrious_generated/aa12167e1b22.png b/illustrious_generated/aa12167e1b22.png
new file mode 100644
index 0000000000000000000000000000000000000000..04629bdee0c85a4406a290d12bf0dba83be236e4
--- /dev/null
+++ b/illustrious_generated/aa12167e1b22.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5255ec62cf2c1b5cec36564bb36103331c1500bb7e537b2ebb194817fac16bb
+size 1012013
diff --git a/illustrious_generated/ac13afd03cf4.png b/illustrious_generated/ac13afd03cf4.png
new file mode 100644
index 0000000000000000000000000000000000000000..0979ffb4ac9ac09ffdbb66cb2c60c7e8241ca94a
--- /dev/null
+++ b/illustrious_generated/ac13afd03cf4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f964755c86a389c8a4daf4c8a9f2ff1010a4b27536c3c6f23a154c5500b6ebd4
+size 3149185
diff --git a/illustrious_generated/ae44ccdc53f3.png b/illustrious_generated/ae44ccdc53f3.png
new file mode 100644
index 0000000000000000000000000000000000000000..51c37476efb6a0a9cc0e571d131aa40c38c39f66
--- /dev/null
+++ b/illustrious_generated/ae44ccdc53f3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7f7fc18bf88b1420641f93b2725a74eaa7dee7733b48d119b35a19045bcc6ca
+size 1491226
diff --git a/illustrious_generated/b02dd2e58a3a.png b/illustrious_generated/b02dd2e58a3a.png
new file mode 100644
index 0000000000000000000000000000000000000000..0c7a85ff9f100b82ed87d2c95986a9b1a505e2f4
--- /dev/null
+++ b/illustrious_generated/b02dd2e58a3a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1980e7a47f15c82f5cd113f14a5f312a7fe1b8f85502f7a58c7494737586073d
+size 3001125
diff --git a/illustrious_generated/b3e4152bedd8.png b/illustrious_generated/b3e4152bedd8.png
new file mode 100644
index 0000000000000000000000000000000000000000..6615cb7d5d109cae8fdb07a95ba15fe175963d93
--- /dev/null
+++ b/illustrious_generated/b3e4152bedd8.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52267ebb09161e6a63a0901a731e3ad5d4b1eea97dab2d0d21f66adbaecbf55b
+size 812279
diff --git a/illustrious_generated/b41ab503548d.png b/illustrious_generated/b41ab503548d.png
new file mode 100644
index 0000000000000000000000000000000000000000..c28332632e996fed2c28a45e74b674adafefda73
--- /dev/null
+++ b/illustrious_generated/b41ab503548d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7d65f5536a4ac3e18196e03ba2142fdc93ec7b99e4936a7560216cba20a6926
+size 1079924
diff --git a/illustrious_generated/b497d88bf627.png b/illustrious_generated/b497d88bf627.png
new file mode 100644
index 0000000000000000000000000000000000000000..30e2cd502ce796cddf45cee43c701d5941aab5d3
--- /dev/null
+++ b/illustrious_generated/b497d88bf627.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:645f8e4524a70843702523741ba3941d57011d23ecae68b0e93571e5df343b59
+size 1694316
diff --git a/illustrious_generated/b4d7041ee056.png b/illustrious_generated/b4d7041ee056.png
new file mode 100644
index 0000000000000000000000000000000000000000..cb3ba62cba38235c6a7b6f995dcabd36428fe796
--- /dev/null
+++ b/illustrious_generated/b4d7041ee056.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2b2d02a751ea252fa921499a9de653ebdad5f28e9c7924d683568ab6bd8170f
+size 1529227
diff --git a/illustrious_generated/b60c16076bb4.png b/illustrious_generated/b60c16076bb4.png
new file mode 100644
index 0000000000000000000000000000000000000000..b8af3fd0deee856e3b8505fc8d51be8424a228d3
--- /dev/null
+++ b/illustrious_generated/b60c16076bb4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f44097393f678b29891d067d3569384220019edbaada8946be339be2d106a761
+size 2716512
diff --git a/illustrious_generated/b6ea090258fa.png b/illustrious_generated/b6ea090258fa.png
new file mode 100644
index 0000000000000000000000000000000000000000..5952b9792d8fa5e85ebf3d3ffd0d5e99e3110f29
--- /dev/null
+++ b/illustrious_generated/b6ea090258fa.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0e59caac5a297dd7fd8c9b479cfba86216db4fb77d168c4e74b9bfbc2619218
+size 4991741
diff --git a/illustrious_generated/b7a1f3ab80ee.png b/illustrious_generated/b7a1f3ab80ee.png
new file mode 100644
index 0000000000000000000000000000000000000000..ac24da6e143ddedcf766b4ea21080bb0acd062e5
--- /dev/null
+++ b/illustrious_generated/b7a1f3ab80ee.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26f9e539444e4dbb7ee5534e2046e19c06e7046c42021ba7737db96fdeb3e402
+size 2845208
diff --git a/illustrious_generated/b86b095aad9e.png b/illustrious_generated/b86b095aad9e.png
new file mode 100644
index 0000000000000000000000000000000000000000..a89f13806e94bd6bc7ae4426f7ff9c1e75e98cff
--- /dev/null
+++ b/illustrious_generated/b86b095aad9e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f65e2ee0465d8a0b623f96ff3cf978fab7a274affd1cc86cfd0fecfb369e4ee
+size 1120113
diff --git a/illustrious_generated/b919ce31fadb.png b/illustrious_generated/b919ce31fadb.png
new file mode 100644
index 0000000000000000000000000000000000000000..03fb6c7910e31158497354406dd4df6d66091bdf
--- /dev/null
+++ b/illustrious_generated/b919ce31fadb.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb4d4ff5afd2075ca433e7fabaa97591f490121fab501512a6679dcd9393c6e6
+size 2612488
diff --git a/illustrious_generated/b9bb1a4f4e90.png b/illustrious_generated/b9bb1a4f4e90.png
new file mode 100644
index 0000000000000000000000000000000000000000..13b2aa829db97bfa1d170b4139086240639b83e1
--- /dev/null
+++ b/illustrious_generated/b9bb1a4f4e90.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da7fad405a8972898c2c3daf8877531f370f06b071cfcf2509a7fae4eb4f4a18
+size 1583179
diff --git a/illustrious_generated/bbd198e1056b.png b/illustrious_generated/bbd198e1056b.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4d2e652a141f466bd9f10b9a4043a4689a5bdf8
--- /dev/null
+++ b/illustrious_generated/bbd198e1056b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9ed5c78549a06bfc49865d7bcba3350a2243c4050689940bda122df98436f1a
+size 2125987
diff --git a/illustrious_generated/bd237c20c4a4.png b/illustrious_generated/bd237c20c4a4.png
new file mode 100644
index 0000000000000000000000000000000000000000..edc02c086e745485305fd03b7459584c9d4cd409
--- /dev/null
+++ b/illustrious_generated/bd237c20c4a4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea64be2c23f2fcdc357c0ab7b6d2ae020fbb91b57a74f538a7ecb653dcfa7316
+size 438098
diff --git a/illustrious_generated/bd7187a15eb7.png b/illustrious_generated/bd7187a15eb7.png
new file mode 100644
index 0000000000000000000000000000000000000000..70295073a17d7ecff6e84755dd6922258f1734ae
--- /dev/null
+++ b/illustrious_generated/bd7187a15eb7.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d4db1858e8665a31aba2b80bed1c008a7c1e4b2cc165d8ce363f33dc157adf8
+size 960074
diff --git a/illustrious_generated/be943688a592.png b/illustrious_generated/be943688a592.png
new file mode 100644
index 0000000000000000000000000000000000000000..9657c1613e7ddb8acd32b4d7fdc39366ec805c31
--- /dev/null
+++ b/illustrious_generated/be943688a592.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a02b50a7f950e15368b64f4b8d4f42d1227ea58badb213a46cea459e057aeaa
+size 889196
diff --git a/illustrious_generated/bee14f23ce18.png b/illustrious_generated/bee14f23ce18.png
new file mode 100644
index 0000000000000000000000000000000000000000..d78ed14855d26bc74b861fbe99bd69814c1cec31
--- /dev/null
+++ b/illustrious_generated/bee14f23ce18.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63f8153dec9158ad823e8f207cc170ff757212240a8049471205e7c1e41eb4e6
+size 3624361
diff --git a/illustrious_generated/c1f19445e27b.png b/illustrious_generated/c1f19445e27b.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f393f9a512ec9d388d53e2a9ef7cb6cbb4b3f62
--- /dev/null
+++ b/illustrious_generated/c1f19445e27b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e10e39f32e8cfbfbaa7e051187e78f02728d063a3e5a4fcfa1dd71ab376597e4
+size 2080197
diff --git a/illustrious_generated/c2a3f4eaf18f.png b/illustrious_generated/c2a3f4eaf18f.png
new file mode 100644
index 0000000000000000000000000000000000000000..76f0bcb681c08e9d06f2aae9c18efbf22675f918
--- /dev/null
+++ b/illustrious_generated/c2a3f4eaf18f.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01db23c3c4f1cc2dc07bda401e6cd54b18c9c876977c182e765b1c93973caa4a
+size 2290394
diff --git a/illustrious_generated/c315f64e722e.png b/illustrious_generated/c315f64e722e.png
new file mode 100644
index 0000000000000000000000000000000000000000..15a405ba207fe718a2cc01b7f4947e7782ddb7c8
--- /dev/null
+++ b/illustrious_generated/c315f64e722e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d45bdb9544b5028bb9ea8c44d78c937a716a0c29223991fa9785b848238526e
+size 2182676
diff --git a/illustrious_generated/c3b6106b5d1d.png b/illustrious_generated/c3b6106b5d1d.png
new file mode 100644
index 0000000000000000000000000000000000000000..466f357aa8bf42d0e0d86abc4da1ab6be5d0f567
--- /dev/null
+++ b/illustrious_generated/c3b6106b5d1d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4b152197e559cab32b2ef364923a0588705f2538bf4e64320e17dc1b046c4c0
+size 3224991
diff --git a/illustrious_generated/c5c9559835a0.png b/illustrious_generated/c5c9559835a0.png
new file mode 100644
index 0000000000000000000000000000000000000000..5ca088637fdadff05857fb40d5a735083081745c
--- /dev/null
+++ b/illustrious_generated/c5c9559835a0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c16d2cda1b8ff5830b65b1dea6af407bab57836a47e284defecb848b97eb6af5
+size 4024924
diff --git a/illustrious_generated/c6b905cdcfeb.png b/illustrious_generated/c6b905cdcfeb.png
new file mode 100644
index 0000000000000000000000000000000000000000..6441e0dd9417a8d0435394ede5fa2999fb259e49
--- /dev/null
+++ b/illustrious_generated/c6b905cdcfeb.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2583b93a5ef73b0a70f2390b6b9942782da8d9ace3474232413f9de473ae5a37
+size 1685111
diff --git a/illustrious_generated/c8effc769399.png b/illustrious_generated/c8effc769399.png
new file mode 100644
index 0000000000000000000000000000000000000000..a6417441f2349b35ab67e76da653c66a35cb4cbf
--- /dev/null
+++ b/illustrious_generated/c8effc769399.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4effc5ba110dad7d763e5d5ee5fb5353254f33c604a580c8b036c1d9b1277aaa
+size 4155838
diff --git a/illustrious_generated/c99fbd49f81e.png b/illustrious_generated/c99fbd49f81e.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f3aba5832cf36dfb9521ce809bd527a276b3659
--- /dev/null
+++ b/illustrious_generated/c99fbd49f81e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8062dc29d3729b2cce90906c828485cd21c299e7da1b6ff90733c9eceb754947
+size 2266901
diff --git a/illustrious_generated/ca26ac4bf239.png b/illustrious_generated/ca26ac4bf239.png
new file mode 100644
index 0000000000000000000000000000000000000000..20b304662bf01b590b485608d9ca58066f803b39
--- /dev/null
+++ b/illustrious_generated/ca26ac4bf239.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86bd00f838c3115d996af7be75e428178b1952fc4f10ec75faaefa1a594aa7eb
+size 923688
diff --git a/illustrious_generated/ca6f517ea8fa.png b/illustrious_generated/ca6f517ea8fa.png
new file mode 100644
index 0000000000000000000000000000000000000000..660ab3e4b6564e6eeaddaba02565f6fa0ae3cf62
--- /dev/null
+++ b/illustrious_generated/ca6f517ea8fa.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b280eac4668719ba3b44e03152611476ce6912859ae0846f2bb55b116b946b39
+size 838322
diff --git a/illustrious_generated/cba2d736599c.png b/illustrious_generated/cba2d736599c.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff02a726a589b5b7e2710a578bc651b2d65cef39
--- /dev/null
+++ b/illustrious_generated/cba2d736599c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:361ce36c381c3939fc7ce5c902acb838843210cd887908051863890d252b20a6
+size 733309
diff --git a/illustrious_generated/cbf43ed5326a.png b/illustrious_generated/cbf43ed5326a.png
new file mode 100644
index 0000000000000000000000000000000000000000..70e4cd95e944beb8409d1eda870ea72ef91af619
--- /dev/null
+++ b/illustrious_generated/cbf43ed5326a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71be31dd3e325df0506777e397bbf898499f67bc00919d35cca93bedd099141d
+size 2559180
diff --git a/illustrious_generated/cc21e3368371.png b/illustrious_generated/cc21e3368371.png
new file mode 100644
index 0000000000000000000000000000000000000000..be4268d49684f695e3c94c08804203f0c8bc36ed
--- /dev/null
+++ b/illustrious_generated/cc21e3368371.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e802797e1c90299147cc501ea151411a79701782e1d1c8db64e4f22febfae44
+size 2213758
diff --git a/illustrious_generated/cc49299f0c4f.png b/illustrious_generated/cc49299f0c4f.png
new file mode 100644
index 0000000000000000000000000000000000000000..7e0a056e7ce7b6f3fbd504e0d6090e23768d3f31
--- /dev/null
+++ b/illustrious_generated/cc49299f0c4f.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d2e71f194e81ac43a2587c66dbdf1b215c0241d33692ec6690ee74a57c5dcc0
+size 1067248
diff --git a/illustrious_generated/cd971f942cae.png b/illustrious_generated/cd971f942cae.png
new file mode 100644
index 0000000000000000000000000000000000000000..9bcf6ceb2aa6a1896bf0e761f5a44abd0b8e923c
--- /dev/null
+++ b/illustrious_generated/cd971f942cae.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:378f264e4279136ec7e9557291e3328e364704a2b520288cab65b3a4376d622b
+size 742750
diff --git a/illustrious_generated/cdb2b548c854.png b/illustrious_generated/cdb2b548c854.png
new file mode 100644
index 0000000000000000000000000000000000000000..c6439ee9c1e5b077b5a3034c766c889c0c2f530c
--- /dev/null
+++ b/illustrious_generated/cdb2b548c854.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57bb3e58f368001f2e3b271148eedf5de693e6f7eef3f15a89cb76d6c2200c96
+size 709842
diff --git a/illustrious_generated/ced077dcba6a.png b/illustrious_generated/ced077dcba6a.png
new file mode 100644
index 0000000000000000000000000000000000000000..626d048cf8647e02768678081a1d03feb05bbc38
--- /dev/null
+++ b/illustrious_generated/ced077dcba6a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:947b609db11cd15231192d09f8f42065012544eccabb134bc827ea1b95e8c92d
+size 2164492
diff --git a/illustrious_generated/cf18aa678c82.png b/illustrious_generated/cf18aa678c82.png
new file mode 100644
index 0000000000000000000000000000000000000000..4055c726b8ee66adddf9aa402c9581aeceb76607
--- /dev/null
+++ b/illustrious_generated/cf18aa678c82.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7cc90cfa85e3f5c4680c5b85044109771f8bdaf68884b93ffe7e56611610dde
+size 812377
diff --git a/illustrious_generated/d337c665d640.png b/illustrious_generated/d337c665d640.png
new file mode 100644
index 0000000000000000000000000000000000000000..a050de3547aa468ae30bfd40cdafb1a31869c105
--- /dev/null
+++ b/illustrious_generated/d337c665d640.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dc43202c8a3f1b4e91f3c04b96748f60fbc16995dc3253a814a54f9de1c0802
+size 483819
diff --git a/illustrious_generated/d3bd2a22b273.png b/illustrious_generated/d3bd2a22b273.png
new file mode 100644
index 0000000000000000000000000000000000000000..e807b5d309f4e10bae94056ba53b72a697b9821e
--- /dev/null
+++ b/illustrious_generated/d3bd2a22b273.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66c2458b0bc55d8bb9deb594c5f17560844881071d859332ae5093ddee0dd7b0
+size 384429
diff --git a/illustrious_generated/d48dd85a0e62.png b/illustrious_generated/d48dd85a0e62.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ee2560edef1f314a087598a6d642791903f605d
--- /dev/null
+++ b/illustrious_generated/d48dd85a0e62.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:094aabac63b38da4d04519766fe850da9233731a1eb516dcbeb117cb9621e69b
+size 3353138
diff --git a/illustrious_generated/d563fc3ed6cf.png b/illustrious_generated/d563fc3ed6cf.png
new file mode 100644
index 0000000000000000000000000000000000000000..8118a4da13b4826b91e3aebd5ac692edaa9c4b6b
--- /dev/null
+++ b/illustrious_generated/d563fc3ed6cf.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61b3d7fbfbc0112506ddc2069779c9afb1401988e373b51c082437b3b20ec2a4
+size 1644114
diff --git a/illustrious_generated/d62bf5dcbc94.png b/illustrious_generated/d62bf5dcbc94.png
new file mode 100644
index 0000000000000000000000000000000000000000..bc3a335f7488ff9a9466835cdfe7640374184e77
--- /dev/null
+++ b/illustrious_generated/d62bf5dcbc94.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d455c1754e8ece7af27b2028e726f134b4b9aaa9cf4bb38582d9832fee5470ed
+size 1294640
diff --git a/illustrious_generated/d6c9f0a9e161.png b/illustrious_generated/d6c9f0a9e161.png
new file mode 100644
index 0000000000000000000000000000000000000000..3ac2c2e712f0c174495b254f71db36809b749d3e
--- /dev/null
+++ b/illustrious_generated/d6c9f0a9e161.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa06cd736cafa8fb53cd4963a25997daf34d19ab0a8a8a87018ba721b0d788e
+size 1001675
diff --git a/illustrious_generated/d72cfefd9423.png b/illustrious_generated/d72cfefd9423.png
new file mode 100644
index 0000000000000000000000000000000000000000..7aa3866387c973f1494057b70dc509d428f8617d
--- /dev/null
+++ b/illustrious_generated/d72cfefd9423.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b57ac4179bafea5d5ed85621b70d0bfc2994af040d19d21ad7ca354bedaabcfa
+size 1576529
diff --git a/illustrious_generated/d979e3d32abf.png b/illustrious_generated/d979e3d32abf.png
new file mode 100644
index 0000000000000000000000000000000000000000..07a587ca6004fb7e5da066238dbd722f412383a5
--- /dev/null
+++ b/illustrious_generated/d979e3d32abf.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29783c2e2a8846759484195b128d73889d225f5f8e93244aa25dcf08ffc7ee19
+size 2636763
diff --git a/illustrious_generated/d9e560672527.png b/illustrious_generated/d9e560672527.png
new file mode 100644
index 0000000000000000000000000000000000000000..41127cee1f18a75ee0af83d9919ab3f95a5dc9ec
--- /dev/null
+++ b/illustrious_generated/d9e560672527.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7b59f6852407252f3b5ee644d215aa01432ae4910dfee5027ad9f0590978d67
+size 3151830
diff --git a/illustrious_generated/d9f2aa1ba51d.png b/illustrious_generated/d9f2aa1ba51d.png
new file mode 100644
index 0000000000000000000000000000000000000000..629c19b7b5a86220a1649bc05030813e4a5bbc8b
--- /dev/null
+++ b/illustrious_generated/d9f2aa1ba51d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce00e36a72335448f4654cf43306e90d0c5b3d231133fa7739b928ca0421e9dd
+size 458824
diff --git a/illustrious_generated/da83ff193381.png b/illustrious_generated/da83ff193381.png
new file mode 100644
index 0000000000000000000000000000000000000000..4721a112e26cbe678c4d9dce577b8ea2e5318a6e
--- /dev/null
+++ b/illustrious_generated/da83ff193381.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab8a3b8708c0105fa4c4eec07000ea4104fd2ed06ec2bfb9ec7e1d47eeec6278
+size 4536627
diff --git a/illustrious_generated/daa4477a619b.png b/illustrious_generated/daa4477a619b.png
new file mode 100644
index 0000000000000000000000000000000000000000..7aaad19bb423705a8c9eed3296566c164b8d716c
--- /dev/null
+++ b/illustrious_generated/daa4477a619b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c7e00ae505025fb5c67ecef4d87063f341c47cc3abd6dc1a12de972328cee49
+size 750149
diff --git a/illustrious_generated/dbd730eadc95.png b/illustrious_generated/dbd730eadc95.png
new file mode 100644
index 0000000000000000000000000000000000000000..2422f585e1bd5bd28195f206b32206190c144cce
--- /dev/null
+++ b/illustrious_generated/dbd730eadc95.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b734958331690d65c896d7f3a8806750ea47693c9b58805bec6745ce824cd942
+size 2197220
diff --git a/illustrious_generated/deab8fc10451.png b/illustrious_generated/deab8fc10451.png
new file mode 100644
index 0000000000000000000000000000000000000000..76777c9896a904a7a43c68844323b4f311407851
--- /dev/null
+++ b/illustrious_generated/deab8fc10451.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4464924dcbd9e526e904589a8de6cac24d3ecd7b9b2bc305545f34645d7d9e42
+size 953593
diff --git a/illustrious_generated/e08f6e84dc64.png b/illustrious_generated/e08f6e84dc64.png
new file mode 100644
index 0000000000000000000000000000000000000000..f1763146ce5b80b08851b231a70ef2f9c2194db1
--- /dev/null
+++ b/illustrious_generated/e08f6e84dc64.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46243286a3604be98186162adf4fee47c689e85a24670a02aeaa7bb74042d5e4
+size 3016758
diff --git a/illustrious_generated/e0c9797bdcc4.png b/illustrious_generated/e0c9797bdcc4.png
new file mode 100644
index 0000000000000000000000000000000000000000..1622628e9635b51722896cc2f52195121dabcaae
--- /dev/null
+++ b/illustrious_generated/e0c9797bdcc4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1b490c719b44f28be3dcea043daa17ee0ef73ba3965a65d1496d14de2dfd0e0
+size 382272
diff --git a/illustrious_generated/e0e1f519c17b.png b/illustrious_generated/e0e1f519c17b.png
new file mode 100644
index 0000000000000000000000000000000000000000..f38631323868e1a87f7b069f37879868344373b4
--- /dev/null
+++ b/illustrious_generated/e0e1f519c17b.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fa3431c1b5987813a5ad95958781370472306434974f8dcc2e70ea96678232f
+size 1558493
diff --git a/illustrious_generated/e0e2270fe38c.png b/illustrious_generated/e0e2270fe38c.png
new file mode 100644
index 0000000000000000000000000000000000000000..110705cf0770073bcbd6b4f675b16ae480d4a485
--- /dev/null
+++ b/illustrious_generated/e0e2270fe38c.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e804d45b639443cf3702c096fdf3aa5ca7e47a125c1f0ac3d68cc1c31ab04d0d
+size 1624100
diff --git a/illustrious_generated/e30322f764fa.png b/illustrious_generated/e30322f764fa.png
new file mode 100644
index 0000000000000000000000000000000000000000..b08c4737f4e5f2365bafb97cad12c4d3fe576f26
--- /dev/null
+++ b/illustrious_generated/e30322f764fa.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d628a45dd0b32ca9e40f1cc5dd42042dd65e2856357fbd908f7794d67601be8c
+size 4427167
diff --git a/illustrious_generated/e5eae5f51f9a.png b/illustrious_generated/e5eae5f51f9a.png
new file mode 100644
index 0000000000000000000000000000000000000000..76ede5c46409036a2ca50babd9aaec0428663cd1
--- /dev/null
+++ b/illustrious_generated/e5eae5f51f9a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9972ce00b482ec146f8fede3a299d4580989e39ac179baf7eaac63edf7f01cdb
+size 1733043
diff --git a/illustrious_generated/e7153dbd46c4.png b/illustrious_generated/e7153dbd46c4.png
new file mode 100644
index 0000000000000000000000000000000000000000..5b680221aa31c208fc0f428ca929d8708159a15c
--- /dev/null
+++ b/illustrious_generated/e7153dbd46c4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c700de051e69829f69903a027fe1d3d3f84b08038ad1ac117e2297a6c60466e3
+size 605752
diff --git a/illustrious_generated/e886088f9603.png b/illustrious_generated/e886088f9603.png
new file mode 100644
index 0000000000000000000000000000000000000000..9a0382ae7d8349d6b2431a17cd1a6fb78154a120
--- /dev/null
+++ b/illustrious_generated/e886088f9603.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c84fe22033cb7b2cdcf48cb7fa1029d2e2b01c2463fe5dbb76bf9a6739c392f2
+size 4017300
diff --git a/illustrious_generated/e8d2670c6e8a.png b/illustrious_generated/e8d2670c6e8a.png
new file mode 100644
index 0000000000000000000000000000000000000000..62dfc09f90b9302031d7c4fa39a72fa6b8d29f0d
--- /dev/null
+++ b/illustrious_generated/e8d2670c6e8a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b075f2f321b3b00ffc8c5ffb161536f1f32ce72cfff8eb7f51a88a08118359
+size 2241015
diff --git a/illustrious_generated/e9dfcff21425.png b/illustrious_generated/e9dfcff21425.png
new file mode 100644
index 0000000000000000000000000000000000000000..039000a3b6733e233f3c854e1577f1074a1ef272
--- /dev/null
+++ b/illustrious_generated/e9dfcff21425.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acb0365cfbd702151dbc7d93dfcb01163d7a76bd9fdc2b07e8089a5302c4c7ae
+size 1732855
diff --git a/illustrious_generated/ea26c25bf7aa.png b/illustrious_generated/ea26c25bf7aa.png
new file mode 100644
index 0000000000000000000000000000000000000000..03d201cffa67a48667ea1a776ad64c757ce66e20
--- /dev/null
+++ b/illustrious_generated/ea26c25bf7aa.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f35c33d91f03418194ac745e581e6559356b8ee62dd2262b0d568fee40357b7
+size 2775067
diff --git a/illustrious_generated/ea9390c9334a.png b/illustrious_generated/ea9390c9334a.png
new file mode 100644
index 0000000000000000000000000000000000000000..1395fdc36f01c91584ab58c0e30d6e36f28d9056
--- /dev/null
+++ b/illustrious_generated/ea9390c9334a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3ee1a558e4b00f414949decb8c23dc45d8ebcea4a4ba1637bd1d3281cd23d43
+size 3092948
diff --git a/illustrious_generated/eae6fa8da581.png b/illustrious_generated/eae6fa8da581.png
new file mode 100644
index 0000000000000000000000000000000000000000..50eda27f9fae9e2c7302c0b8d405622bdfa9240c
--- /dev/null
+++ b/illustrious_generated/eae6fa8da581.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca318ad9ce44134a1a5bd1e838f46663f36cbfdd189ba2f9823fdd608e522c0d
+size 2353312
diff --git a/illustrious_generated/eb0a51a08785.png b/illustrious_generated/eb0a51a08785.png
new file mode 100644
index 0000000000000000000000000000000000000000..b5c6f0f50fd537c65767299692f16162bf3ca78f
--- /dev/null
+++ b/illustrious_generated/eb0a51a08785.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c90754ef4cf8856473d152b269dc3b8b9f6aae88b777b2cbad453190fff1451
+size 1351643
diff --git a/illustrious_generated/ecec4646a402.png b/illustrious_generated/ecec4646a402.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d2b0244a9f4f330746a516febd12a03dad82964
--- /dev/null
+++ b/illustrious_generated/ecec4646a402.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ae737c50aac3cf9648601d9c64a7a60c5c6b69dd3d236ce355c6971795f34b2
+size 1818058
diff --git a/illustrious_generated/ee3d34de5c79.png b/illustrious_generated/ee3d34de5c79.png
new file mode 100644
index 0000000000000000000000000000000000000000..7e2dec78e4fc50a808599fccfce7f0e79e5cb978
--- /dev/null
+++ b/illustrious_generated/ee3d34de5c79.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc359d217c423d2bf2efb172228dde9b6f9e06ea827f8deadc5aaa85c7537e1d
+size 2187513
diff --git a/illustrious_generated/eff53c185549.png b/illustrious_generated/eff53c185549.png
new file mode 100644
index 0000000000000000000000000000000000000000..809446715e5a70e13d0ece210e8bb1985aa38f05
--- /dev/null
+++ b/illustrious_generated/eff53c185549.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e95101506ee4394eaae39ab9ea99df1edc1c46d3b60ccbbc313d7d6d53243ab6
+size 1654492
diff --git a/illustrious_generated/f1475a16a1cc.png b/illustrious_generated/f1475a16a1cc.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7756176b6e1160a73d1f01c6626f4ff25ab5c9d
--- /dev/null
+++ b/illustrious_generated/f1475a16a1cc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bb3fd043131a027fc32c7401da23fd0d868a077571ca1e7a40c910fa1db18c1
+size 900069
diff --git a/illustrious_generated/f149cbf64f42.png b/illustrious_generated/f149cbf64f42.png
new file mode 100644
index 0000000000000000000000000000000000000000..aac2010fef48ac458683cb9163adeeda16ef9b00
--- /dev/null
+++ b/illustrious_generated/f149cbf64f42.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3f9c137ef8b658a2d169ec2bad9f57e3533ec772ea634a238eb15d9df8186d
+size 2315453
diff --git a/illustrious_generated/f5d412584d10.png b/illustrious_generated/f5d412584d10.png
new file mode 100644
index 0000000000000000000000000000000000000000..809b7f4d721c63254814d5d84b6f435b78342db8
--- /dev/null
+++ b/illustrious_generated/f5d412584d10.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f189824777642b953964303baf1a08bc4adf93cdb703ec9498e39cbfddc86385
+size 1328263
diff --git a/illustrious_generated/f7a76637f5ff.png b/illustrious_generated/f7a76637f5ff.png
new file mode 100644
index 0000000000000000000000000000000000000000..35d3249258939db0f66034f84fdc38d84cda80f1
--- /dev/null
+++ b/illustrious_generated/f7a76637f5ff.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78fa47db1cd8c94025088b122b31a1219c117198f5b17027d4f19711f94ba237
+size 1609450
diff --git a/illustrious_generated/f84052e8b91d.png b/illustrious_generated/f84052e8b91d.png
new file mode 100644
index 0000000000000000000000000000000000000000..89034bcf04a21df17cc94310349d681914e347e2
--- /dev/null
+++ b/illustrious_generated/f84052e8b91d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65f990e9df51c2985e0a6f905f9bce23e864b18ef5edae76ba81bdd69c308187
+size 971956
diff --git a/illustrious_generated/f8866eadaa51.png b/illustrious_generated/f8866eadaa51.png
new file mode 100644
index 0000000000000000000000000000000000000000..c0341ea1a38f0cad4f714950b75dd9377aa26c11
--- /dev/null
+++ b/illustrious_generated/f8866eadaa51.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ddb089d7fff315100a0ad9b9a6c1086b229561edef83de75c155fa5b1b60e7c
+size 1227986
diff --git a/illustrious_generated/fa1bc4acdc0d.png b/illustrious_generated/fa1bc4acdc0d.png
new file mode 100644
index 0000000000000000000000000000000000000000..1dfdd577c9bb1be46bbc89310740cc8dcab00bc0
--- /dev/null
+++ b/illustrious_generated/fa1bc4acdc0d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cde72a5487c44bc3a695f826499cf89725bd3854a4f1ddb86a0441a395731498
+size 3248752
diff --git a/illustrious_generated/fa5fabcf698e.png b/illustrious_generated/fa5fabcf698e.png
new file mode 100644
index 0000000000000000000000000000000000000000..fb227d393c77caa10f807a2ff29fce32ec9d6cbe
--- /dev/null
+++ b/illustrious_generated/fa5fabcf698e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8082bccfe1e3c86574d4853e047d4ca55717f9d6f28b9f17bfa233853c720d5f
+size 1254727
diff --git a/illustrious_generated/fa83bef79f58.png b/illustrious_generated/fa83bef79f58.png
new file mode 100644
index 0000000000000000000000000000000000000000..c3833b2050fd85d6f7102b8f3c9da36ea2e5c5f7
--- /dev/null
+++ b/illustrious_generated/fa83bef79f58.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:504fa34bb8af5b4800cbdd5d7ae62c5abd384ad40112c34f6996ab7dbe5eabf7
+size 3016993
diff --git a/illustrious_generated/fb837ee1983a.png b/illustrious_generated/fb837ee1983a.png
new file mode 100644
index 0000000000000000000000000000000000000000..cedcbc1a16f09a7776c5bdd5746c6d52f4c30420
--- /dev/null
+++ b/illustrious_generated/fb837ee1983a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d59f5ccd4cca73856263bdc6e7cff5f8d08306e5d4e513dcf10d8415ab375b82
+size 3114265
diff --git a/illustrious_generated/fb96879652d6.png b/illustrious_generated/fb96879652d6.png
new file mode 100644
index 0000000000000000000000000000000000000000..69d7944b548618031e4ebe3bb7619ece0bcbc763
--- /dev/null
+++ b/illustrious_generated/fb96879652d6.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:860d932f1d8612cf508857cd3b58960a55360651bc135df2bca9baf499bc8a7a
+size 983313
diff --git a/illustrious_generated/fc4cfe2d8bbd.png b/illustrious_generated/fc4cfe2d8bbd.png
new file mode 100644
index 0000000000000000000000000000000000000000..55bc3016fa0d545ea00c59ea2ef36003b0eb425b
--- /dev/null
+++ b/illustrious_generated/fc4cfe2d8bbd.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3b8ebbe24fc88947b7018e3c3dce204543a717129dcc94df8ac38c335a77b5d
+size 1139918
diff --git a/illustrious_generated/fc7d541ba16e.png b/illustrious_generated/fc7d541ba16e.png
new file mode 100644
index 0000000000000000000000000000000000000000..b20f68927f8160ca62052a44abcf3c02ab97a136
--- /dev/null
+++ b/illustrious_generated/fc7d541ba16e.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b9dd20996792e41ae8a2578786a8918f9207bcfe9df3c9bea1339d93eb8e7ff
+size 635033
diff --git a/illustrious_generated/fcc811dfea64.png b/illustrious_generated/fcc811dfea64.png
new file mode 100644
index 0000000000000000000000000000000000000000..ec956969b030fe01bba0491d2139556b28285d49
--- /dev/null
+++ b/illustrious_generated/fcc811dfea64.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cbd46201cef022e46573a3c0f482b8654c76760aaec6f515d8fca5be4d5a479
+size 2810754
diff --git a/illustrious_generated/fe6ae3e9c893.png b/illustrious_generated/fe6ae3e9c893.png
new file mode 100644
index 0000000000000000000000000000000000000000..3280f8ddd3ca30725c9f10cdad857cc2825d0147
--- /dev/null
+++ b/illustrious_generated/fe6ae3e9c893.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a3c71bc6ece71bd2d4133298b23434ea901160b89b3582c565a25376ca06fe2
+size 2275382
diff --git a/illustrious_generated/fea6cb2e79bc.png b/illustrious_generated/fea6cb2e79bc.png
new file mode 100644
index 0000000000000000000000000000000000000000..2429880462d4e061622cbb9eadb3d5f80b7a867a
--- /dev/null
+++ b/illustrious_generated/fea6cb2e79bc.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d3e71484cfe75431b712114866a6585c611176ee279549d017c58b28cd40995
+size 1229429
diff --git a/illustrious_generated/feb2e566f26d.png b/illustrious_generated/feb2e566f26d.png
new file mode 100644
index 0000000000000000000000000000000000000000..78d9868a789754f31039b8446ca8596fc55168b4
--- /dev/null
+++ b/illustrious_generated/feb2e566f26d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:246faa91da44b60e083e7e0029c42d32472921a923a3000304077902f09d51d4
+size 1587033
diff --git a/output.png b/output.png
new file mode 100644
index 0000000000000000000000000000000000000000..6af42dd9682ae9f34c13155ce8618e935bdd72af
--- /dev/null
+++ b/output.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54c2f0a8b88c286c4fbc837a8f4c76ed650852e061a461d33a49b6170b9711ac
+size 1266194
diff --git a/test_qwen_sdxl_1.png b/test_qwen_sdxl_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..7e0b260eff5c6e7d635d75e29c773482fde0743b
--- /dev/null
+++ b/test_qwen_sdxl_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a907e20065eca4132d9d8ecab476fe6f21d6e2a5983edeec75b672ba9a6c2c7e
+size 193562
diff --git a/test_qwen_sdxl_2.png b/test_qwen_sdxl_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..f0b37f9b9cf9ce3e50bc1837faa7e082e2dfd45e
--- /dev/null
+++ b/test_qwen_sdxl_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea6d1d116fba91844572ae5a8eabaedaf33de77792b9a2d3e69b24904ee63d11
+size 366508
diff --git a/transformers/src/transformers/__pycache__/__init__.cpython-312.pyc b/transformers/src/transformers/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae57f45c485c70e6fe19dcc1041ebeb46bf0046c
Binary files /dev/null and b/transformers/src/transformers/__pycache__/__init__.cpython-312.pyc differ
diff --git a/transformers/src/transformers/__pycache__/activations.cpython-312.pyc b/transformers/src/transformers/__pycache__/activations.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51474252f6d571ca9b6123224c3515e9c7826ecb
Binary files /dev/null and b/transformers/src/transformers/__pycache__/activations.cpython-312.pyc differ
diff --git a/transformers/src/transformers/__pycache__/audio_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/audio_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..578305421f46522642b74083854244ef19c1cb81
Binary files /dev/null and b/transformers/src/transformers/__pycache__/audio_utils.cpython-312.pyc differ
diff --git a/transformers/src/transformers/__pycache__/configuration_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/configuration_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f97f9cc2849a570a6b96d16fc82aae57cb71ad7
Binary files /dev/null and b/transformers/src/transformers/__pycache__/configuration_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/convert_slow_tokenizer.cpython-312.pyc b/transformers/src/transformers/__pycache__/convert_slow_tokenizer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aeed278c9181b10c54a18ede4ebcd354cfea562f Binary files /dev/null and b/transformers/src/transformers/__pycache__/convert_slow_tokenizer.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/debug_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/debug_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63a3a799d7d6a0e7f923dfe977443fbcf2d7edcb Binary files /dev/null and b/transformers/src/transformers/__pycache__/debug_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/dependency_versions_check.cpython-312.pyc b/transformers/src/transformers/__pycache__/dependency_versions_check.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70970dd4b30924fd17f4bb712b9b9480cb3f0b6b Binary files /dev/null and b/transformers/src/transformers/__pycache__/dependency_versions_check.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/dependency_versions_table.cpython-312.pyc b/transformers/src/transformers/__pycache__/dependency_versions_table.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83389a4b9953dde6180457219d826f878dd0ca3c Binary files /dev/null and b/transformers/src/transformers/__pycache__/dependency_versions_table.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/dynamic_module_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/dynamic_module_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..975d30fa8f14cdb9fecc5806b221a5ceef7168c2 Binary files /dev/null and b/transformers/src/transformers/__pycache__/dynamic_module_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/feature_extraction_sequence_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/feature_extraction_sequence_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae15bfc4d09062fadc171d0ded5c49f91674f63c Binary files /dev/null and b/transformers/src/transformers/__pycache__/feature_extraction_sequence_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/feature_extraction_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/feature_extraction_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..497e0e7c6e0146c11e20db38c5f6d97e50a95c4a Binary files /dev/null and b/transformers/src/transformers/__pycache__/feature_extraction_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/file_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/file_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e48228f67983994a812c23f88a1680a859f63ca Binary files /dev/null and b/transformers/src/transformers/__pycache__/file_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/hyperparameter_search.cpython-312.pyc b/transformers/src/transformers/__pycache__/hyperparameter_search.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..44d030ef579bf29b5c96221fd684d0062f590de1 Binary files /dev/null and b/transformers/src/transformers/__pycache__/hyperparameter_search.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/image_processing_base.cpython-312.pyc b/transformers/src/transformers/__pycache__/image_processing_base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5bd6765e50b7237a64701ba6cf950a50a1fe6d69 Binary files /dev/null and b/transformers/src/transformers/__pycache__/image_processing_base.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/image_processing_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/image_processing_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ea2a29840c7aa31ba9718b42798baea1129ba61 Binary files /dev/null and b/transformers/src/transformers/__pycache__/image_processing_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/image_processing_utils_fast.cpython-312.pyc b/transformers/src/transformers/__pycache__/image_processing_utils_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cd617e33a1c36bbe366c8cb5ba19b66daefdbfe Binary files /dev/null and b/transformers/src/transformers/__pycache__/image_processing_utils_fast.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/image_transforms.cpython-312.pyc b/transformers/src/transformers/__pycache__/image_transforms.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f908c8e3e8359eb12ef1afa26e819367f27792e9 Binary files /dev/null and b/transformers/src/transformers/__pycache__/image_transforms.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/image_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/image_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47e3e81bf2a5ab3a0aee48f43500128bbbccf6fc Binary files /dev/null and b/transformers/src/transformers/__pycache__/image_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/masking_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/masking_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5df0fb1425d46f20c9b1bdaa7218c98f83c805d0 Binary files /dev/null and b/transformers/src/transformers/__pycache__/masking_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/modelcard.cpython-312.pyc b/transformers/src/transformers/__pycache__/modelcard.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7c4d2ad66d505a10a5a2e04129c782c966282a1 Binary files /dev/null and b/transformers/src/transformers/__pycache__/modelcard.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/modeling_attn_mask_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/modeling_attn_mask_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..360606d8a4293c71418cde9bd3d5f3d6e6c1c9d6 Binary files /dev/null and b/transformers/src/transformers/__pycache__/modeling_attn_mask_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/modeling_flash_attention_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/modeling_flash_attention_utils.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..cf41417ea79607d8d8d686a15557cbbe3a178728 Binary files /dev/null and b/transformers/src/transformers/__pycache__/modeling_flash_attention_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..310d5fda05afaa59438010dfdf89bb5ad0d86815 Binary files /dev/null and b/transformers/src/transformers/__pycache__/modeling_gguf_pytorch_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/modeling_layers.cpython-312.pyc b/transformers/src/transformers/__pycache__/modeling_layers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93bc67c224f9d661cb42c03d1ade748a7977b0fd Binary files /dev/null and b/transformers/src/transformers/__pycache__/modeling_layers.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/modeling_rope_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/modeling_rope_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52a1fdc7f93086a0361c8cbb074e7019c08a2376 Binary files /dev/null and b/transformers/src/transformers/__pycache__/modeling_rope_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/optimization.cpython-312.pyc b/transformers/src/transformers/__pycache__/optimization.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3613091fac8fd6c5bae1a2bf340d0ecae23e63f8 Binary files /dev/null and b/transformers/src/transformers/__pycache__/optimization.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/processing_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/processing_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7dee54c0eaba774a4bafc4ea4d9942da2790dbf Binary files /dev/null and b/transformers/src/transformers/__pycache__/processing_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/pytorch_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/pytorch_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88a67155de6fb8c673b27e6c3a2b1dd3da1fe061 Binary files /dev/null and b/transformers/src/transformers/__pycache__/pytorch_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/safetensors_conversion.cpython-312.pyc b/transformers/src/transformers/__pycache__/safetensors_conversion.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6659e6e85b242b52deaa8f8f17e23397e9050c3 Binary files /dev/null and b/transformers/src/transformers/__pycache__/safetensors_conversion.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/tokenization_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/tokenization_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b3b0e0231a1a3b3c5b9f34411c5db52a85e7a29 Binary files /dev/null and b/transformers/src/transformers/__pycache__/tokenization_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/tokenization_utils_fast.cpython-312.pyc b/transformers/src/transformers/__pycache__/tokenization_utils_fast.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..6813d86ecf1a5a436725dd26110831a889dd7756 Binary files /dev/null and b/transformers/src/transformers/__pycache__/tokenization_utils_fast.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/trainer_callback.cpython-312.pyc b/transformers/src/transformers/__pycache__/trainer_callback.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d68d28ad527f177a5a8e681074e6aafdb48c39f5 Binary files /dev/null and b/transformers/src/transformers/__pycache__/trainer_callback.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/trainer_pt_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/trainer_pt_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd13eaeb3e3fd05118ad413e0096e1864faf63b4 Binary files /dev/null and b/transformers/src/transformers/__pycache__/trainer_pt_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/trainer_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/trainer_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fed4b1e41c16482c55f8e8963db17efa315372b5 Binary files /dev/null and b/transformers/src/transformers/__pycache__/trainer_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/video_processing_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/video_processing_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9aee8b086a446af6aecc0e92f3653df51159f3de Binary files /dev/null and b/transformers/src/transformers/__pycache__/video_processing_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/__pycache__/video_utils.cpython-312.pyc b/transformers/src/transformers/__pycache__/video_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..721031d0af12c0016fe1f58b927063a6e752ba60 Binary files /dev/null and b/transformers/src/transformers/__pycache__/video_utils.cpython-312.pyc differ diff --git a/transformers/src/transformers/commands/__init__.py b/transformers/src/transformers/commands/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa5d95a85b538171ec9cf4fa16e892df1efdef6b --- /dev/null +++ b/transformers/src/transformers/commands/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from abc import ABC, abstractmethod +from argparse import ArgumentParser + + +class BaseTransformersCLICommand(ABC): + @staticmethod + @abstractmethod + def register_subcommand(parser: ArgumentParser): + raise NotImplementedError() + + @abstractmethod + def run(self): + raise NotImplementedError() diff --git a/transformers/src/transformers/commands/add_fast_image_processor.py b/transformers/src/transformers/commands/add_fast_image_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..90c911525f7773a34ca9f97f180d5b127649f3c2 --- /dev/null +++ b/transformers/src/transformers/commands/add_fast_image_processor.py @@ -0,0 +1,530 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from argparse import ArgumentParser, Namespace +from datetime import date +from pathlib import Path + +from ..utils import logging +from . import BaseTransformersCLICommand + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +CURRENT_YEAR = date.today().year +TRANSFORMERS_PATH = Path(__file__).parent.parent +REPO_PATH = TRANSFORMERS_PATH.parent.parent + + +def add_fast_image_processor_to_model_init( + fast_image_processing_module_file: str, fast_image_processor_name, model_name: str +): + """ + Add the fast image processor to the __init__.py file of the model. + """ + with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "r", encoding="utf-8") as f: + content = f.read() + + fast_image_processing_module_file = fast_image_processing_module_file.split(os.sep)[-1].replace(".py", "") + + if "import *" in content: + # we have an init file in the updated format + # get the indented block after if TYPE_CHECKING: and before else:, append the new import, sort the imports and write the updated content + # Step 1: Find the block + block_regex = re.compile( + r"if TYPE_CHECKING:\n(?P.*?)(?=\s*else:)", + re.DOTALL, + ) + match = block_regex.search(content) + + if not match: + raise ValueError("Couldn't find the 'if TYPE_CHECKING' block.") + + block_content = match.group("if_block") # The captured import block + + # Step 2: Parse existing entries + entries = block_content.split("\n") + indent = " " * (len(entries[0]) - len(entries[0].lstrip())) + new_entry = f"{indent}from .{fast_image_processing_module_file} import *" + if new_entry not in entries: + entries.append(new_entry) + entries.sort() + updated_block = "\n".join(entry for entry in entries) + + # Replace the original block in the content + updated_content = content[: match.start("if_block")] + updated_block + content[match.end("if_block") :] + else: + # we have an init file in the old format + + # add "is_torchvision_available" import to from ...utils import ( + # Regex to match import statements from transformers.utils + pattern = r""" + from\s+\.\.\.utils\s+import\s+ + (?: # Non-capturing group for either: + ([\w, ]+) # 1. Single-line imports (e.g., 'a, b') + | # OR + \((.*?)\) # 2. Multi-line imports (e.g., '(a, ... 
b)') + ) + """ + regex = re.compile(pattern, re.VERBOSE | re.DOTALL) + + def replacement_function(match): + # Extract existing imports + imports = (match.group(1) or match.group(2)).split(",") + imports = imports[:-1] if imports[-1] == "\n" else imports + imports = [imp.strip() for imp in imports] + + # Add the new import if not already present + if "is_torchvision_available" not in imports: + imports.append("is_torchvision_available") + imports.sort() + + # Convert to multi-line import in all cases + updated_imports = "(\n " + ",\n ".join(imports) + ",\n)" + + return f"from ...utils import {updated_imports}" + + # Replace all matches in the file content + updated_content = regex.sub(replacement_function, content) + + vision_import_structure_block = f' _import_structure["{fast_image_processing_module_file[:-5]}"] = ["{fast_image_processor_name[:-4]}"]\n' + + added_import_structure_block = ( + "try:\n if not is_torchvision_available():\n" + " raise OptionalDependencyNotAvailable()\n" + "except OptionalDependencyNotAvailable:\n" + " pass\n" + "else:\n" + f' _import_structure["{fast_image_processing_module_file}"] = ["{fast_image_processor_name}"]\n' + ) + + if vision_import_structure_block not in updated_content: + raise ValueError("Couldn't find the 'vision _import_structure block' block.") + + if added_import_structure_block not in updated_content: + updated_content = updated_content.replace( + vision_import_structure_block, vision_import_structure_block + "\n" + added_import_structure_block + ) + + vision_import_statement_block = ( + f" from .{fast_image_processing_module_file[:-5]} import {fast_image_processor_name[:-4]}\n" + ) + + added_import_statement_block = ( + " try:\n if not is_torchvision_available():\n" + " raise OptionalDependencyNotAvailable()\n" + " except OptionalDependencyNotAvailable:\n" + " pass\n" + " else:\n" + f" from .{fast_image_processing_module_file} import {fast_image_processor_name}\n" + ) + + if vision_import_statement_block not in updated_content: + raise ValueError("Couldn't find the 'vision _import_structure block' block.") + + if added_import_statement_block not in updated_content: + updated_content = updated_content.replace( + vision_import_statement_block, vision_import_statement_block + "\n" + added_import_statement_block + ) + + # write the updated content + with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_auto(image_processor_name: str, fast_image_processor_name: str): + """ + Add the fast image processor to the auto module. + """ + with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "r", encoding="utf-8") as f: + content = f.read() + + # get all lines containing the image processor name + updated_content = content.replace( + f'("{image_processor_name}",)', f'("{image_processor_name}", "{fast_image_processor_name}")' + ) + + # write the updated content + with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_doc(fast_image_processor_name: str, model_name: str): + """ + Add the fast image processor to the model's doc file. 
+ """ + doc_source = REPO_PATH / "docs" / "source" + # find the doc files + doc_files = list(doc_source.glob(f"*/model_doc/{model_name}.md")) + if not doc_files: + # try again with "-" + doc_files = list(doc_source.glob(f"*/model_doc/{model_name.replace('_', '-')}.md")) + if not doc_files: + raise ValueError(f"No doc files found for {model_name}") + + base_doc_string = ( + f"## {fast_image_processor_name[:-4]}\n\n[[autodoc]] {fast_image_processor_name[:-4]}\n - preprocess" + ) + fast_doc_string = f"## {fast_image_processor_name}\n\n[[autodoc]] {fast_image_processor_name}\n - preprocess" + + for doc_file in doc_files: + with open(doc_file, "r", encoding="utf-8") as f: + content = f.read() + + if fast_doc_string not in content: + # add the fast image processor to the doc + updated_content = content.replace( + base_doc_string, + base_doc_string + "\n\n" + fast_doc_string, + ) + + # write the updated content + with open(doc_file, "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_tests(fast_image_processor_name: str, model_name: str): + """ + Add the fast image processor to the image processing tests. + """ + tests_path = REPO_PATH / "tests" / "models" / model_name + test_file = tests_path / f"test_image_processing_{model_name}.py" + if not os.path.exists(test_file): + logger.warning(f"No test file found for {model_name}. Skipping.") + return + + with open(test_file, "r", encoding="utf-8") as f: + content = f.read() + + # add is_torchvision_available import to the imports + # Regex to match import statements from transformers.utils + pattern = r""" + from\s+transformers\.utils\s+import\s+ + (?: # Non-capturing group for either: + ([\w, ]+) # 1. Single-line imports (e.g., 'a, b') + | # OR + \((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)') + ) + """ + regex = re.compile(pattern, re.VERBOSE | re.DOTALL) + + def replacement_function(match): + # Extract existing imports + existing_imports = (match.group(1) or match.group(2)).split(",") + existing_imports = existing_imports[:-1] if existing_imports[-1] == "\n" else existing_imports + existing_imports = [imp.strip() for imp in existing_imports] + + # Add the new import if not already present + if "is_torchvision_available" not in existing_imports: + existing_imports.append("is_torchvision_available") + existing_imports.sort() + + # Rebuild the import statement + if match.group(1): # Single-line import + updated_imports = ", ".join(existing_imports) + else: # Multi-line import + updated_imports = "(\n " + ",\n ".join(existing_imports) + ",\n)" + + return f"from transformers.utils import {updated_imports}" + + # Replace all matches in the file content + updated_content = regex.sub(replacement_function, content) + + # add the fast image processor to the imports + base_import_string = f" from transformers import {fast_image_processor_name[:-4]}" + fast_import_string = ( + f" if is_torchvision_available():\n from transformers import {fast_image_processor_name}" + ) + if fast_import_string not in updated_content: + updated_content = updated_content.replace(base_import_string, base_import_string + "\n\n" + fast_import_string) + + # get line starting with " image_processing_class = " and add a line after it starting with " fast_image_processing_class = " + image_processing_class_line = re.search(r" image_processing_class = .*", updated_content) + if not image_processing_class_line: + logger.warning(f"Couldn't find the 'image_processing_class' line in {test_file}. 
Skipping.") + return + + fast_image_processing_class_line = ( + f" fast_image_processing_class = {fast_image_processor_name} if is_torchvision_available() else None" + ) + if " fast_image_processing_class = " not in updated_content: + updated_content = updated_content.replace( + image_processing_class_line.group(0), + image_processing_class_line.group(0) + "\n" + fast_image_processing_class_line, + ) + + # write the updated content + with open(test_file, "w", encoding="utf-8") as f: + f.write(updated_content) + + +def get_fast_image_processing_content_header(content: str) -> str: + """ + Get the header of the slow image processor file. + """ + # get all the commented lines at the beginning of the file + content_header = re.search(r"^# coding=utf-8\n(#[^\n]*\n)*", content, re.MULTILINE) + if not content_header: + logger.warning("Couldn't find the content header in the slow image processor file. Using a default header.") + return ( + f"# coding=utf-8\n" + f"# Copyright {CURRENT_YEAR} The HuggingFace Team. All rights reserved.\n" + f"#\n" + f'# Licensed under the Apache License, Version 2.0 (the "License");\n' + f"# you may not use this file except in compliance with the License.\n" + f"# You may obtain a copy of the License at\n" + f"#\n" + f"# http://www.apache.org/licenses/LICENSE-2.0\n" + f"#\n" + f"# Unless required by applicable law or agreed to in writing, software\n" + f'# distributed under the License is distributed on an "AS IS" BASIS,\n' + f"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + f"# See the License for the specific language governing permissions and\n" + f"# limitations under the License.\n" + f"\n" + ) + content_header = content_header.group(0) + # replace the year in the copyright + content_header = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content_header) + # get the line starting with """Image processor in content if it exists + match = re.search(r'^"""Image processor.*$', content, re.MULTILINE) + if match: + content_header += match.group(0).replace("Image processor", "Fast Image processor") + + return content_header + + +def write_default_fast_image_processor_file( + fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str +): + """ + Write a default fast image processor file. Used when encountering a problem while parsing the slow image processor file. + """ + imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n\n\n" + content_header = get_fast_image_processing_content_header(content_base_file) + content_base_file = ( + f"class {fast_image_processor_name}(BaseImageProcessorFast):\n" + " # To be implemented\n" + " resample = None\n" + " image_mean = None\n" + " image_std = None\n" + " size = None\n" + " default_to_square = None\n" + " crop_size = None\n" + " do_resize = None\n" + " do_center_crop = None\n" + " do_rescale = None\n" + " do_normalize = None\n" + " do_convert_rgb = None\n\n\n" + f'__all__ = ["{fast_image_processor_name}"]\n' + ) + + content = content_header + imports + content_base_file + + with open(fast_image_processing_module_file, "w", encoding="utf-8") as f: + f.write(content) + + +def add_fast_image_processor_file( + fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str +): + """ + Add the fast image processor file to the model's folder. 
+ """ + # if the file already exists, do nothing + if os.path.exists(fast_image_processing_module_file): + print(f"{fast_image_processing_module_file} already exists. Skipping.") + return + + regex = rf"class {fast_image_processor_name[:-4]}.*?(\n\S|$)" + match = re.search(regex, content_base_file, re.DOTALL) + if not match: + print(f"Couldn't find the {fast_image_processor_name[:-4]} class in {fast_image_processing_module_file}") + print("Creating a new file with the default content.") + return write_default_fast_image_processor_file( + fast_image_processing_module_file, fast_image_processor_name, content_base_file + ) + # Exclude the last unindented line + slow_class_content = match.group(0).rstrip() + # get default args: + # find the __init__ block which start with def __init__ and ends with def + match = re.search(r"def __init__.*?def ", slow_class_content, re.DOTALL) + if not match: + print( + f"Couldn't find the __init__ block for {fast_image_processor_name[:-4]} in {fast_image_processing_module_file}" + ) + print("Creating a new file with the default content.") + return write_default_fast_image_processor_file( + fast_image_processing_module_file, fast_image_processor_name, content_base_file + ) + init = match.group(0) + init_signature_block = init.split(")")[0] + arg_names = init_signature_block.split(":") + arg_names = [arg_name.split("\n")[-1].strip() for arg_name in arg_names] + # get the default values + default_args = re.findall(r"= (.*?)(?:,|\))", init_signature_block) + + # build default args dict + default_args_dict = dict(zip(arg_names, default_args)) + pattern_default_size = r"size = size if size is not None else\s+(.*)" + match_default_size = re.findall(pattern_default_size, init) + default_args_dict["size"] = match_default_size[0] if match_default_size else None + pattern_default_crop_size = r"crop_size = crop_size if crop_size is not None else\s+(.*)" + match_default_crop_size = re.findall(pattern_default_crop_size, init) + default_args_dict["crop_size"] = match_default_crop_size[0] if match_default_crop_size else None + pattern_default_image_mean = r"self.image_mean = image_mean if image_mean is not None else\s+(.*)" + match_default_image_mean = re.findall(pattern_default_image_mean, init) + default_args_dict["image_mean"] = match_default_image_mean[0] if match_default_image_mean else None + pattern_default_image_std = r"self.image_std = image_std if image_std is not None else\s+(.*)" + match_default_image_std = re.findall(pattern_default_image_std, init) + default_args_dict["image_std"] = match_default_image_std[0] if match_default_image_std else None + default_args_dict["default_to_square"] = False if "(size, default_to_square=False" in init else None + + content_header = get_fast_image_processing_content_header(content_base_file) + content_base_file = ( + f"@auto_docstring\n" + f"class {fast_image_processor_name}(BaseImageProcessorFast):\n" + " # This generated class can be used as a starting point for the fast image processor.\n" + " # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing,\n" + " # only the default values should be set in the class.\n" + " # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden.\n" + " # In most cases, only the `_preprocess` method should be overridden.\n\n" + " # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`.\n\n" + " # Default values 
should be checked against the slow image processor\n" + " # None values left after checking can be removed\n" + f" resample = {default_args_dict.get('resample')}\n" + f" image_mean = {default_args_dict.get('image_mean')}\n" + f" image_std = {default_args_dict.get('image_std')}\n" + f" size = {default_args_dict.get('size')}\n" + f" default_to_square = {default_args_dict.get('default_to_square')}\n" + f" crop_size = {default_args_dict.get('crop_size')}\n" + f" do_resize = {default_args_dict.get('do_resize')}\n" + f" do_center_crop = {default_args_dict.get('do_center_crop')}\n" + f" do_rescale = {default_args_dict.get('do_rescale')}\n" + f" do_normalize = {default_args_dict.get('do_normalize')}\n" + f" do_convert_rgb = {default_args_dict.get('do_convert_rgb')}\n\n\n" + f'__all__ = ["{fast_image_processor_name}"]\n' + ) + + imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n" + image_utils_imports = [] + if default_args_dict.get("resample") is not None and "PILImageResampling" in default_args_dict.get("resample"): + image_utils_imports.append("PILImageResampling") + if default_args_dict.get("image_mean") is not None and not any( + char.isdigit() for char in default_args_dict.get("image_mean") + ): + image_utils_imports.append(default_args_dict.get("image_mean")) + if default_args_dict.get("image_std") is not None and not any( + char.isdigit() for char in default_args_dict.get("image_std") + ): + image_utils_imports.append(default_args_dict.get("image_std")) + + if image_utils_imports: + # sort imports + image_utils_imports.sort() + imports += f"from ...image_utils import {', '.join(image_utils_imports)}\n" + + imports += "from ...utils import auto_docstring\n" + + content = content_header + imports + "\n\n" + content_base_file + + with open(fast_image_processing_module_file, "w", encoding="utf-8") as f: + f.write(content) + + +def add_fast_image_processor(model_name: str): + """ + Add the necessary references to the fast image processor in the transformers package, + and create the fast image processor file in the model's folder. 
+ """ + model_module = TRANSFORMERS_PATH / "models" / model_name + image_processing_module_file = list(model_module.glob("image_processing*.py")) + if not image_processing_module_file: + raise ValueError(f"No image processing module found in {model_module}") + elif len(image_processing_module_file) > 1: + for file_name in image_processing_module_file: + if not str(file_name).endswith("_fast.py"): + image_processing_module_file = str(file_name) + break + else: + image_processing_module_file = str(image_processing_module_file[0]) + + with open(image_processing_module_file, "r", encoding="utf-8") as f: + content_base_file = f.read() + + # regex to find object starting with "class " and ending with "ImageProcessor", including "ImageProcessor" in the match + image_processor_name = re.findall(r"class (\w*ImageProcessor)", content_base_file) + if not image_processor_name: + raise ValueError(f"No ImageProcessor class found in {image_processing_module_file}") + elif len(image_processor_name) > 1: + raise ValueError(f"Multiple ImageProcessor classes found in {image_processing_module_file}") + + image_processor_name = image_processor_name[0] + fast_image_processor_name = image_processor_name + "Fast" + fast_image_processing_module_file = image_processing_module_file.replace(".py", "_fast.py") + + print(f"Adding {fast_image_processor_name} to {fast_image_processing_module_file}") + + add_fast_image_processor_to_model_init( + fast_image_processing_module_file=fast_image_processing_module_file, + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_to_auto( + image_processor_name=image_processor_name, + fast_image_processor_name=fast_image_processor_name, + ) + + add_fast_image_processor_to_doc( + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_to_tests( + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_file( + fast_image_processing_module_file=fast_image_processing_module_file, + fast_image_processor_name=fast_image_processor_name, + content_base_file=content_base_file, + ) + + +def add_new_model_like_command_factory(args: Namespace): + return AddFastImageProcessorCommand(model_name=args.model_name) + + +class AddFastImageProcessorCommand(BaseTransformersCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + add_fast_image_processor_parser = parser.add_parser("add-fast-image-processor") + add_fast_image_processor_parser.add_argument( + "--model-name", + type=str, + required=True, + help="The name of the folder containing the model's implementation.", + ) + add_fast_image_processor_parser.set_defaults(func=add_new_model_like_command_factory) + + def __init__(self, model_name: str, *args): + self.model_name = model_name + + def run(self): + add_fast_image_processor(model_name=self.model_name) diff --git a/transformers/src/transformers/commands/add_new_model_like.py b/transformers/src/transformers/commands/add_new_model_like.py new file mode 100644 index 0000000000000000000000000000000000000000..a38f0f317dc95ef00ef5294308b1e9ad18f30c9f --- /dev/null +++ b/transformers/src/transformers/commands/add_new_model_like.py @@ -0,0 +1,1803 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# 1. Standard library +import difflib +import json +import os +import re +from argparse import ArgumentParser, Namespace +from dataclasses import dataclass +from datetime import date +from itertools import chain +from pathlib import Path +from re import Pattern +from typing import Any, Callable, Optional, Union + +import yaml + +from ..models import auto as auto_module +from ..models.auto.configuration_auto import model_type_to_module_name +from ..utils import ( + is_flax_available, + is_tf_available, + is_torch_available, + logging, +) +from . import BaseTransformersCLICommand +from .add_fast_image_processor import add_fast_image_processor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +CURRENT_YEAR = date.today().year +TRANSFORMERS_PATH = Path(__file__).parent.parent +REPO_PATH = TRANSFORMERS_PATH.parent.parent + + +@dataclass +class ModelPatterns: + """ + Holds the basic information about a new model for the add-new-model-like command. + + Args: + model_name (`str`): The model name. + checkpoint (`str`): The checkpoint to use for doc examples. + model_type (`str`, *optional*): + The model type, the identifier used internally in the library like `bert` or `xlm-roberta`. Will default to + `model_name` lowercased with spaces replaced with minuses (-). + model_lower_cased (`str`, *optional*): + The lowercased version of the model name, to use for the module name or function names. Will default to + `model_name` lowercased with spaces and minuses replaced with underscores. + model_camel_cased (`str`, *optional*): + The camel-cased version of the model name, to use for the class names. Will default to `model_name` + camel-cased (with spaces and minuses both considered as word separators. + model_upper_cased (`str`, *optional*): + The uppercased version of the model name, to use for the constant names. Will default to `model_name` + uppercased with spaces and minuses replaced with underscores. + config_class (`str`, *optional*): + The tokenizer class associated with this model. Will default to `"{model_camel_cased}Config"`. + tokenizer_class (`str`, *optional*): + The tokenizer class associated with this model (leave to `None` for models that don't use a tokenizer). + image_processor_class (`str`, *optional*): + The image processor class associated with this model (leave to `None` for models that don't use an image + processor). + image_processor_fast_class (`str`, *optional*): + The fast image processor class associated with this model (leave to `None` for models that don't use a fast + image processor). + feature_extractor_class (`str`, *optional*): + The feature extractor class associated with this model (leave to `None` for models that don't use a feature + extractor). + processor_class (`str`, *optional*): + The processor class associated with this model (leave to `None` for models that don't use a processor). 
+ """ + + model_name: str + checkpoint: str + model_type: Optional[str] = None + model_lower_cased: Optional[str] = None + model_camel_cased: Optional[str] = None + model_upper_cased: Optional[str] = None + config_class: Optional[str] = None + tokenizer_class: Optional[str] = None + image_processor_class: Optional[str] = None + image_processor_fast_class: Optional[str] = None + feature_extractor_class: Optional[str] = None + processor_class: Optional[str] = None + + def __post_init__(self): + if self.model_type is None: + self.model_type = self.model_name.lower().replace(" ", "-") + if self.model_lower_cased is None: + self.model_lower_cased = self.model_name.lower().replace(" ", "_").replace("-", "_") + if self.model_camel_cased is None: + # Split the model name on - and space + words = self.model_name.split(" ") + words = list(chain(*[w.split("-") for w in words])) + # Make sure each word is capitalized + words = [w[0].upper() + w[1:] for w in words] + self.model_camel_cased = "".join(words) + if self.model_upper_cased is None: + self.model_upper_cased = self.model_name.upper().replace(" ", "_").replace("-", "_") + if self.config_class is None: + self.config_class = f"{self.model_camel_cased}Config" + + +ATTRIBUTE_TO_PLACEHOLDER = { + "config_class": "[CONFIG_CLASS]", + "tokenizer_class": "[TOKENIZER_CLASS]", + "image_processor_class": "[IMAGE_PROCESSOR_CLASS]", + "image_processor_fast_class": "[IMAGE_PROCESSOR_FAST_CLASS]", + "feature_extractor_class": "[FEATURE_EXTRACTOR_CLASS]", + "processor_class": "[PROCESSOR_CLASS]", + "checkpoint": "[CHECKPOINT]", + "model_type": "[MODEL_TYPE]", + "model_upper_cased": "[MODEL_UPPER_CASED]", + "model_camel_cased": "[MODEL_CAMELCASED]", + "model_lower_cased": "[MODEL_LOWER_CASED]", + "model_name": "[MODEL_NAME]", +} + + +def is_empty_line(line: str) -> bool: + """ + Determines whether a line is empty or not. + """ + return len(line) == 0 or line.isspace() + + +def find_indent(line: str) -> int: + """ + Returns the number of spaces that start a line indent. + """ + search = re.search(r"^(\s*)(?:\S|$)", line) + if search is None: + return 0 + return len(search.groups()[0]) + + +def parse_module_content(content: str) -> list[str]: + """ + Parse the content of a module in the list of objects it defines. + + Args: + content (`str`): The content to parse + + Returns: + `list[str]`: The list of objects defined in the module. + """ + objects = [] + current_object = [] + lines = content.split("\n") + # Doc-styler takes everything between two triple quotes in docstrings, so we need a fake """ here to go with this. + end_markers = [")", "]", "}", '"""'] + + for line in lines: + # End of an object + is_valid_object = len(current_object) > 0 + if is_valid_object and len(current_object) == 1: + is_valid_object = not current_object[0].startswith("# Copied from") + if not is_empty_line(line) and find_indent(line) == 0 and is_valid_object: + # Closing parts should be included in current object + if line in end_markers: + current_object.append(line) + objects.append("\n".join(current_object)) + current_object = [] + else: + objects.append("\n".join(current_object)) + current_object = [line] + else: + current_object.append(line) + + # Add last object + if len(current_object) > 0: + objects.append("\n".join(current_object)) + + return objects + + +def extract_block(content: str, indent_level: int = 0) -> str: + """Return the first block in `content` with the indent level `indent_level`. 
+ + The first line in `content` should be indented at `indent_level` level, otherwise an error will be thrown. + + This method will immediately stop the search when a (non-empty) line with indent level less than `indent_level` is + encountered. + + Args: + content (`str`): The content to parse + indent_level (`int`, *optional*, default to 0): The indent level of the blocks to search for + + Returns: + `str`: The first block in `content` with the indent level `indent_level`. + """ + current_object = [] + lines = content.split("\n") + # Doc-styler takes everything between two triple quotes in docstrings, so we need a fake """ here to go with this. + end_markers = [")", "]", "}", '"""'] + + for idx, line in enumerate(lines): + if idx == 0 and indent_level > 0 and not is_empty_line(line) and find_indent(line) != indent_level: + raise ValueError( + f"When `indent_level > 0`, the first line in `content` should have indent level {indent_level}. Got " + f"{find_indent(line)} instead." + ) + + if find_indent(line) < indent_level and not is_empty_line(line): + break + + # End of an object + is_valid_object = len(current_object) > 0 + if ( + not is_empty_line(line) + and not line.endswith(":") + and find_indent(line) == indent_level + and is_valid_object + ): + # Closing parts should be included in current object + if line.lstrip() in end_markers: + current_object.append(line) + return "\n".join(current_object) + else: + current_object.append(line) + + # Add last object + if len(current_object) > 0: + return "\n".join(current_object) + + +def add_content_to_text( + text: str, + content: str, + add_after: Optional[Union[str, Pattern]] = None, + add_before: Optional[Union[str, Pattern]] = None, + exact_match: bool = False, +) -> str: + """ + A utility to add some content inside a given text. + + Args: + text (`str`): The text in which we want to insert some content. + content (`str`): The content to add. + add_after (`str` or `Pattern`): + The pattern to test on a line of `text`, the new content is added after the first instance matching it. + add_before (`str` or `Pattern`): + The pattern to test on a line of `text`, the new content is added before the first instance matching it. + exact_match (`bool`, *optional*, defaults to `False`): + A line is considered a match with `add_after` or `add_before` if it matches exactly when `exact_match=True`, + otherwise, if `add_after`/`add_before` is present in the line. + + + + The arguments `add_after` and `add_before` are mutually exclusive, and one exactly needs to be provided. + + + + Returns: + `str`: The text with the new content added if a match was found. 
+ """ + if add_after is None and add_before is None: + raise ValueError("You need to pass either `add_after` or `add_before`") + if add_after is not None and add_before is not None: + raise ValueError("You can't pass both `add_after` or `add_before`") + pattern = add_after if add_before is None else add_before + + def this_is_the_line(line): + if isinstance(pattern, Pattern): + return pattern.search(line) is not None + elif exact_match: + return pattern == line + else: + return pattern in line + + new_lines = [] + for line in text.split("\n"): + if this_is_the_line(line): + if add_before is not None: + new_lines.append(content) + new_lines.append(line) + if add_after is not None: + new_lines.append(content) + else: + new_lines.append(line) + + return "\n".join(new_lines) + + +def add_content_to_file( + file_name: Union[str, os.PathLike], + content: str, + add_after: Optional[Union[str, Pattern]] = None, + add_before: Optional[Union[str, Pattern]] = None, + exact_match: bool = False, +): + """ + A utility to add some content inside a given file. + + Args: + file_name (`str` or `os.PathLike`): The name of the file in which we want to insert some content. + content (`str`): The content to add. + add_after (`str` or `Pattern`): + The pattern to test on a line of `text`, the new content is added after the first instance matching it. + add_before (`str` or `Pattern`): + The pattern to test on a line of `text`, the new content is added before the first instance matching it. + exact_match (`bool`, *optional*, defaults to `False`): + A line is considered a match with `add_after` or `add_before` if it matches exactly when `exact_match=True`, + otherwise, if `add_after`/`add_before` is present in the line. + + + + The arguments `add_after` and `add_before` are mutually exclusive, and one exactly needs to be provided. + + + """ + with open(file_name, "r", encoding="utf-8") as f: + old_content = f.read() + + new_content = add_content_to_text( + old_content, content, add_after=add_after, add_before=add_before, exact_match=exact_match + ) + + with open(file_name, "w", encoding="utf-8") as f: + f.write(new_content) + + +def replace_model_patterns( + text: str, old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns +) -> tuple[str, str]: + """ + Replace all patterns present in a given text. + + Args: + text (`str`): The text to treat. + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + + Returns: + `Tuple(str, str)`: A tuple of with the treated text and the replacement actually done in it. + """ + # The order is crucially important as we will check and replace in that order. For instance the config probably + # contains the camel-cased named, but will be treated before. 
+ attributes_to_check = ["config_class"] + # Add relevant preprocessing classes + for attr in [ + "tokenizer_class", + "image_processor_class", + "image_processor_fast_class", + "feature_extractor_class", + "processor_class", + ]: + if getattr(old_model_patterns, attr) is not None and getattr(new_model_patterns, attr) is not None: + attributes_to_check.append(attr) + + # Special cases for checkpoint and model_type + if old_model_patterns.checkpoint not in [old_model_patterns.model_type, old_model_patterns.model_lower_cased]: + attributes_to_check.append("checkpoint") + if old_model_patterns.model_type != old_model_patterns.model_lower_cased: + attributes_to_check.append("model_type") + else: + text = re.sub( + rf'(\s*)model_type = "{old_model_patterns.model_type}"', + r'\1model_type = "[MODEL_TYPE]"', + text, + ) + + # Special case when the model camel cased and upper cased names are the same for the old model (like for GPT2) but + # not the new one. We can't just do a replace in all the text and will need a special regex + if old_model_patterns.model_upper_cased == old_model_patterns.model_camel_cased: + old_model_value = old_model_patterns.model_upper_cased + if re.search(rf"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None: + text = re.sub(rf"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text) + else: + attributes_to_check.append("model_upper_cased") + + attributes_to_check.extend(["model_camel_cased", "model_lower_cased", "model_name"]) + + # Now let's replace every other attribute by their placeholder + for attr in attributes_to_check: + text = text.replace(getattr(old_model_patterns, attr), ATTRIBUTE_TO_PLACEHOLDER[attr]) + + # Finally we can replace the placeholder byt the new values. + replacements = [] + for attr, placeholder in ATTRIBUTE_TO_PLACEHOLDER.items(): + if placeholder in text: + replacements.append((getattr(old_model_patterns, attr), getattr(new_model_patterns, attr))) + text = text.replace(placeholder, getattr(new_model_patterns, attr)) + + # If we have two inconsistent replacements, we don't return anything (ex: GPT2->GPT_NEW and GPT2->GPTNew) + old_replacement_values = [old for old, new in replacements] + if len(set(old_replacement_values)) != len(old_replacement_values): + return text, "" + + replacements = simplify_replacements(replacements) + replacements = [f"{old}->{new}" for old, new in replacements] + return text, ",".join(replacements) + + +def simplify_replacements(replacements): + """ + Simplify a list of replacement patterns to make sure there are no needless ones. + + For instance in the sequence "Bert->BertNew, BertConfig->BertNewConfig, bert->bert_new", the replacement + "BertConfig->BertNewConfig" is implied by "Bert->BertNew" so not needed. + + Args: + replacements (`list[tuple[str, str]]`): List of patterns (old, new) + + Returns: + `list[tuple[str, str]]`: The list of patterns simplified. + """ + if len(replacements) <= 1: + # Nothing to simplify + return replacements + + # Next let's sort replacements by length as a replacement can only "imply" another replacement if it's shorter. + replacements.sort(key=lambda x: len(x[0])) + + idx = 0 + while idx < len(replacements): + old, new = replacements[idx] + # Loop through all replacements after + j = idx + 1 + while j < len(replacements): + old_2, new_2 = replacements[j] + # If the replacement is implied by the current one, we can drop it. 
+ if old_2.replace(old, new) == new_2: + replacements.pop(j) + else: + j += 1 + idx += 1 + + return replacements + + +def get_module_from_file(module_file: Union[str, os.PathLike]) -> str: + """ + Returns the module name corresponding to a module file. + """ + full_module_path = Path(module_file).absolute() + module_parts = full_module_path.with_suffix("").parts + + # Find the first part named transformers, starting from the end. + idx = len(module_parts) - 1 + while idx >= 0 and module_parts[idx] != "transformers": + idx -= 1 + if idx < 0: + raise ValueError(f"{module_file} is not a transformers module.") + + return ".".join(module_parts[idx:]) + + +SPECIAL_PATTERNS = { + "_CHECKPOINT_FOR_DOC =": "checkpoint", + "_CONFIG_FOR_DOC =": "config_class", + "_TOKENIZER_FOR_DOC =": "tokenizer_class", + "_IMAGE_PROCESSOR_FOR_DOC =": "image_processor_class", + "_FEAT_EXTRACTOR_FOR_DOC =": "feature_extractor_class", + "_PROCESSOR_FOR_DOC =": "processor_class", +} + + +_re_class_func = re.compile(r"^(?:class|def)\s+([^\s:\(]+)\s*(?:\(|\:)", flags=re.MULTILINE) + + +def remove_attributes(obj, target_attr): + """Remove `target_attr` from `obj`.""" + lines = obj.split(os.linesep) + + target_idx = None + for idx, line in enumerate(lines): + # search for assignment + if line.lstrip().startswith(f"{target_attr} = "): + target_idx = idx + break + # search for function/method definition + elif line.lstrip().startswith(f"def {target_attr}("): + target_idx = idx + break + + # target not found + if target_idx is None: + return obj + + line = lines[target_idx] + indent_level = find_indent(line) + # forward pass to find the ending of the block (including empty lines) + parsed = extract_block("\n".join(lines[target_idx:]), indent_level) + num_lines = len(parsed.split("\n")) + for idx in range(num_lines): + lines[target_idx + idx] = None + + # backward pass to find comments or decorators + for idx in range(target_idx - 1, -1, -1): + line = lines[idx] + if (line.lstrip().startswith("#") or line.lstrip().startswith("@")) and find_indent(line) == indent_level: + lines[idx] = None + else: + break + + new_obj = os.linesep.join([x for x in lines if x is not None]) + + return new_obj + + +def duplicate_module( + module_file: Union[str, os.PathLike], + old_model_patterns: ModelPatterns, + new_model_patterns: ModelPatterns, + dest_file: Optional[str] = None, + add_copied_from: bool = True, + attrs_to_remove: Optional[list[str]] = None, +): + """ + Create a new module from an existing one, adapting all function and class names from the old patterns to the new ones. + + Args: + module_file (`str` or `os.PathLike`): Path to the module to duplicate. + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + dest_file (`str` or `os.PathLike`, *optional*): Path to the new module. + add_copied_from (`bool`, *optional*, defaults to `True`): + Whether or not to add `# Copied from` statements in the duplicated module.
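# --- Illustrative sketch, not part of the diff: `simplify_replacements` in action. ---
#     simplify_replacements([("Bert", "BertNew"), ("BertConfig", "BertNewConfig"), ("bert", "bert_new")])
#     # -> [("Bert", "BertNew"), ("bert", "bert_new")]
#     # "BertConfig"->"BertNewConfig" is dropped: applying "Bert"->"BertNew" to "BertConfig"
#     # already produces "BertNewConfig", so the longer pattern is implied by the shorter one.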
+ """ + if dest_file is None: + dest_file = str(module_file).replace( + old_model_patterns.model_lower_cased, new_model_patterns.model_lower_cased + ) + + with open(module_file, "r", encoding="utf-8") as f: + content = f.read() + + content = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content) + objects = parse_module_content(content) + + # Loop and treat all objects + new_objects = [] + for obj in objects: + special_pattern = False + for pattern, attr in SPECIAL_PATTERNS.items(): + if pattern in obj: + obj = obj.replace(getattr(old_model_patterns, attr), getattr(new_model_patterns, attr)) + new_objects.append(obj) + special_pattern = True + break + + if special_pattern: + continue + + # Regular classes functions + old_obj = obj + obj, replacement = replace_model_patterns(obj, old_model_patterns, new_model_patterns) + has_copied_from = re.search(r"^#\s+Copied from", obj, flags=re.MULTILINE) is not None + if add_copied_from and not has_copied_from and _re_class_func.search(obj) is not None and len(replacement) > 0: + # Copied from statement must be added just before the class/function definition, which may not be the + # first line because of decorators. + module_name = get_module_from_file(module_file) + old_object_name = _re_class_func.search(old_obj).groups()[0] + obj = add_content_to_text( + obj, f"# Copied from {module_name}.{old_object_name} with {replacement}", add_before=_re_class_func + ) + # In all cases, we remove Copied from statement with indent on methods. + obj = re.sub("\n[ ]+# Copied from [^\n]*\n", "\n", obj) + + new_objects.append(obj) + + content = "\n".join(new_objects) + # Remove some attributes that we don't want to copy to the new file(s) + if attrs_to_remove is not None: + for attr in attrs_to_remove: + content = remove_attributes(content, target_attr=attr) + + with open(dest_file, "w", encoding="utf-8") as f: + f.write(content) + + +def filter_framework_files( + files: list[Union[str, os.PathLike]], frameworks: Optional[list[str]] = None +) -> list[Union[str, os.PathLike]]: + """ + Filter a list of files to only keep the ones corresponding to a list of frameworks. + + Args: + files (`list[Union[str, os.PathLike]]`): The list of files to filter. + frameworks (`list[str]`, *optional*): The list of allowed frameworks. + + Returns: + `list[Union[str, os.PathLike]]`: The list of filtered files. + """ + if frameworks is None: + frameworks = get_default_frameworks() + + framework_to_file = {} + others = [] + for f in files: + parts = Path(f).name.split("_") + if "modeling" not in parts: + others.append(f) + continue + if "tf" in parts: + framework_to_file["tf"] = f + elif "flax" in parts: + framework_to_file["flax"] = f + else: + framework_to_file["pt"] = f + + return [framework_to_file[f] for f in frameworks if f in framework_to_file] + others + + +def get_model_files(model_type: str, frameworks: Optional[list[str]] = None) -> dict[str, Union[Path, list[Path]]]: + """ + Retrieves all the files associated to a model. + + Args: + model_type (`str`): A valid model type (like "bert" or "gpt2") + frameworks (`list[str]`, *optional*): + If passed, will only keep the model files corresponding to the passed frameworks. + + Returns: + `dict[str, Union[Path, list[Path]]]`: A dictionary with the following keys: + - **doc_file** -- The documentation file for the model. + - **model_files** -- All the files in the model module. + - **test_files** -- The test files for the model. 
+ """ + module_name = model_type_to_module_name(model_type) + + model_module = TRANSFORMERS_PATH / "models" / module_name + model_files = list(model_module.glob("*.py")) + model_files = filter_framework_files(model_files, frameworks=frameworks) + + doc_file = REPO_PATH / "docs" / "source" / "en" / "model_doc" / f"{model_type}.md" + + # Basic pattern for test files + test_files = [ + f"test_modeling_{module_name}.py", + f"test_modeling_tf_{module_name}.py", + f"test_modeling_flax_{module_name}.py", + f"test_tokenization_{module_name}.py", + f"test_image_processing_{module_name}.py", + f"test_feature_extraction_{module_name}.py", + f"test_processor_{module_name}.py", + ] + test_files = filter_framework_files(test_files, frameworks=frameworks) + # Add the test directory + test_files = [REPO_PATH / "tests" / "models" / module_name / f for f in test_files] + # Filter by existing files + test_files = [f for f in test_files if f.exists()] + + return {"doc_file": doc_file, "model_files": model_files, "module_name": module_name, "test_files": test_files} + + +_re_checkpoint_in_config = re.compile(r"\[(.+?)\]\((https://huggingface\.co/.+?)\)") + + +def find_base_model_checkpoint( + model_type: str, model_files: Optional[dict[str, Union[Path, list[Path]]]] = None +) -> str: + """ + Finds the model checkpoint used in the docstrings for a given model. + + Args: + model_type (`str`): A valid model type (like "bert" or "gpt2") + model_files (`dict[str, Union[Path, list[Path]]`, *optional*): + The files associated to `model_type`. Can be passed to speed up the function, otherwise will be computed. + + Returns: + `str`: The checkpoint used. + """ + if model_files is None: + model_files = get_model_files(model_type) + module_files = model_files["model_files"] + for fname in module_files: + # After the @auto_docstring refactor, we expect the checkpoint to be in the configuration file's docstring + if "configuration" not in str(fname): + continue + + with open(fname, "r", encoding="utf-8") as f: + content = f.read() + if _re_checkpoint_in_config.search(content) is not None: + checkpoint = _re_checkpoint_in_config.search(content).groups()[0] + # Remove quotes + checkpoint = checkpoint.replace('"', "") + checkpoint = checkpoint.replace("'", "") + return checkpoint + + # TODO: Find some kind of fallback if there is no _CHECKPOINT_FOR_DOC in any of the modeling file. + return "" + + +def get_default_frameworks(): + """ + Returns the list of frameworks (PyTorch, TensorFlow, Flax) that are installed in the environment. + """ + frameworks = [] + if is_torch_available(): + frameworks.append("pt") + if is_tf_available(): + frameworks.append("tf") + if is_flax_available(): + frameworks.append("flax") + return frameworks + + +_re_model_mapping = re.compile("MODEL_([A-Z_]*)MAPPING_NAMES") + + +def retrieve_model_classes(model_type: str, frameworks: Optional[list[str]] = None) -> dict[str, list[str]]: + """ + Retrieve the model classes associated to a given model. + + Args: + model_type (`str`): A valid model type (like "bert" or "gpt2") + frameworks (`list[str]`, *optional*): + The frameworks to look for. Will default to `["pt", "tf", "flax"]`, passing a smaller list will restrict + the classes returned. + + Returns: + `dict[str, list[str]]`: A dictionary with one key per framework and the list of model classes associated to + that framework as values. 
+ """ + if frameworks is None: + frameworks = get_default_frameworks() + + modules = { + "pt": auto_module.modeling_auto if is_torch_available() else None, + "tf": auto_module.modeling_tf_auto if is_tf_available() else None, + "flax": auto_module.modeling_flax_auto if is_flax_available() else None, + } + + model_classes = {} + for framework in frameworks: + new_model_classes = [] + if modules[framework] is None: + raise ValueError(f"You selected {framework} in the frameworks, but it is not installed.") + model_mappings = [attr for attr in dir(modules[framework]) if _re_model_mapping.search(attr) is not None] + for model_mapping_name in model_mappings: + model_mapping = getattr(modules[framework], model_mapping_name) + if model_type in model_mapping: + new_model_classes.append(model_mapping[model_type]) + + if len(new_model_classes) > 0: + # Remove duplicates + model_classes[framework] = list(set(new_model_classes)) + + return model_classes + + +def retrieve_info_for_model(model_type, frameworks: Optional[list[str]] = None): + """ + Retrieves all the information from a given model_type. + + Args: + model_type (`str`): A valid model type (like "bert" or "gpt2") + frameworks (`list[str]`, *optional*): + If passed, will only keep the info corresponding to the passed frameworks. + + Returns: + `Dict`: A dictionary with the following keys: + - **frameworks** (`list[str]`): The list of frameworks that back this model type. + - **model_classes** (`dict[str, list[str]]`): The model classes implemented for that model type. + - **model_files** (`dict[str, Union[Path, list[Path]]]`): The files associated with that model type. + - **model_patterns** (`ModelPatterns`): The various patterns for the model. + """ + if model_type not in auto_module.MODEL_NAMES_MAPPING: + raise ValueError(f"{model_type} is not a valid model type.") + + model_name = auto_module.MODEL_NAMES_MAPPING[model_type] + config_class = auto_module.configuration_auto.CONFIG_MAPPING_NAMES[model_type] + if model_type in auto_module.tokenization_auto.TOKENIZER_MAPPING_NAMES: + tokenizer_classes = auto_module.tokenization_auto.TOKENIZER_MAPPING_NAMES[model_type] + tokenizer_class = tokenizer_classes[0] if tokenizer_classes[0] is not None else tokenizer_classes[1] + else: + tokenizer_class = None + image_processor_classes = auto_module.image_processing_auto.IMAGE_PROCESSOR_MAPPING_NAMES.get(model_type, None) + if isinstance(image_processor_classes, tuple): + image_processor_class, image_processor_fast_class = image_processor_classes + else: + image_processor_class = image_processor_classes + image_processor_fast_class = None + feature_extractor_class = auto_module.feature_extraction_auto.FEATURE_EXTRACTOR_MAPPING_NAMES.get(model_type, None) + processor_class = auto_module.processing_auto.PROCESSOR_MAPPING_NAMES.get(model_type, None) + + model_files = get_model_files(model_type, frameworks=frameworks) + model_camel_cased = config_class.replace("Config", "") + + available_frameworks = [] + for fname in model_files["model_files"]: + if "modeling_tf" in str(fname): + available_frameworks.append("tf") + elif "modeling_flax" in str(fname): + available_frameworks.append("flax") + elif "modeling" in str(fname): + available_frameworks.append("pt") + + if frameworks is None: + frameworks = get_default_frameworks() + + frameworks = [f for f in frameworks if f in available_frameworks] + + model_classes = retrieve_model_classes(model_type, frameworks=frameworks) + + model_upper_cased = model_camel_cased.upper() + model_patterns = ModelPatterns( + 
model_name, + checkpoint=find_base_model_checkpoint(model_type, model_files=model_files), + model_type=model_type, + model_camel_cased=model_camel_cased, + model_lower_cased=model_files["module_name"], + model_upper_cased=model_upper_cased, + config_class=config_class, + tokenizer_class=tokenizer_class, + image_processor_class=image_processor_class, + image_processor_fast_class=image_processor_fast_class, + feature_extractor_class=feature_extractor_class, + processor_class=processor_class, + ) + + return { + "frameworks": frameworks, + "model_classes": model_classes, + "model_files": model_files, + "model_patterns": model_patterns, + } + + +def clean_frameworks_in_init( + init_file: Union[str, os.PathLike], frameworks: Optional[list[str]] = None, keep_processing: bool = True +): + """ + Removes all the import lines that don't belong to a given list of frameworks or concern tokenizers/feature + extractors/image processors/processors in an init. + + Args: + init_file (`str` or `os.PathLike`): The path to the init to treat. + frameworks (`list[str]`, *optional*): + If passed, this will remove all imports that are subject to a framework not in frameworks + keep_processing (`bool`, *optional*, defaults to `True`): + Whether or not to keep the preprocessing (tokenizer, feature extractor, image processor, processor) imports + in the init. + """ + if frameworks is None: + frameworks = get_default_frameworks() + + names = {"pt": "torch"} + to_remove = [names.get(f, f) for f in ["pt", "tf", "flax"] if f not in frameworks] + if not keep_processing: + to_remove.extend(["sentencepiece", "tokenizers", "vision"]) + + if len(to_remove) == 0: + # Nothing to do + return + + remove_pattern = "|".join(to_remove) + re_conditional_imports = re.compile(rf"^\s*if not is_({remove_pattern})_available\(\):\s*$") + re_try = re.compile(r"\s*try:") + re_else = re.compile(r"\s*else:") + re_is_xxx_available = re.compile(rf"is_({remove_pattern})_available") + + with open(init_file, "r", encoding="utf-8") as f: + content = f.read() + + lines = content.split("\n") + new_lines = [] + idx = 0 + while idx < len(lines): + # Conditional imports in try-except-else blocks + if (re_conditional_imports.search(lines[idx]) is not None) and (re_try.search(lines[idx - 1]) is not None): + # Remove the preceding `try:` + new_lines.pop() + idx += 1 + # Iterate until `else:` + while is_empty_line(lines[idx]) or re_else.search(lines[idx]) is None: + idx += 1 + idx += 1 + indent = find_indent(lines[idx]) + while find_indent(lines[idx]) >= indent or is_empty_line(lines[idx]): + idx += 1 + # Remove the import from utils + elif re_is_xxx_available.search(lines[idx]) is not None: + line = lines[idx] + for framework in to_remove: + line = line.replace(f", is_{framework}_available", "") + line = line.replace(f"is_{framework}_available, ", "") + line = line.replace(f"is_{framework}_available,", "") + line = line.replace(f"is_{framework}_available", "") + + if len(line.strip()) > 0: + new_lines.append(line) + idx += 1 + # Otherwise we keep the line, except if it's a tokenizer import and we don't want to keep it. 
+ elif keep_processing or ( + re.search(r'^\s*"(tokenization|processing|feature_extraction|image_processing)', lines[idx]) is None + and re.search(r"^\s*from .(tokenization|processing|feature_extraction|image_processing)", lines[idx]) + is None + ): + new_lines.append(lines[idx]) + idx += 1 + else: + idx += 1 + + with open(init_file, "w", encoding="utf-8") as f: + f.write("\n".join(new_lines)) + + +def add_model_to_main_init( + old_model_patterns: ModelPatterns, + new_model_patterns: ModelPatterns, + frameworks: Optional[list[str]] = None, + with_processing: bool = True, +): + """ + Add a model to the main init of Transformers. + + Args: + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + frameworks (`list[str]`, *optional*): + If specified, only the models implemented in those frameworks will be added. + with_processing (`bool`, *optional*, defaults to `True`): + Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not. + """ + with open(TRANSFORMERS_PATH / "__init__.py", "r", encoding="utf-8") as f: + content = f.read() + + lines = content.split("\n") + idx = 0 + new_lines = [] + framework = None + while idx < len(lines): + new_framework = False + if not is_empty_line(lines[idx]) and find_indent(lines[idx]) == 0: + framework = None + elif lines[idx].lstrip().startswith("if not is_torch_available"): + framework = "pt" + new_framework = True + elif lines[idx].lstrip().startswith("if not is_tf_available"): + framework = "tf" + new_framework = True + elif lines[idx].lstrip().startswith("if not is_flax_available"): + framework = "flax" + new_framework = True + + if new_framework: + # For a new framework, we need to skip until the else: block to get where the imports are. + while lines[idx].strip() != "else:": + new_lines.append(lines[idx]) + idx += 1 + + # Skip if we are in a framework not wanted. 
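# --- Illustrative sketch, not part of the diff: typical `clean_frameworks_in_init` usage. ---
# The path below is hypothetical.
#     clean_frameworks_in_init("src/transformers/models/new_model/__init__.py", frameworks=["pt"])
#     # Drops the `if not is_tf_available():` / `if not is_flax_available():` try/else import
#     # blocks and strips `is_tf_available` / `is_flax_available` from the utils import line.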
+ if framework is not None and frameworks is not None and framework not in frameworks: + new_lines.append(lines[idx]) + idx += 1 + elif re.search(rf'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None: + block = [lines[idx]] + indent = find_indent(lines[idx]) + idx += 1 + while find_indent(lines[idx]) > indent: + block.append(lines[idx]) + idx += 1 + if lines[idx].strip() in [")", "]", "],"]: + block.append(lines[idx]) + idx += 1 + block = "\n".join(block) + new_lines.append(block) + + add_block = True + if not with_processing: + processing_classes = [ + old_model_patterns.tokenizer_class, + old_model_patterns.image_processor_class, + old_model_patterns.image_processor_fast_class, + old_model_patterns.feature_extractor_class, + old_model_patterns.processor_class, + ] + # Only keep the ones that are not None + processing_classes = [c for c in processing_classes if c is not None] + for processing_class in processing_classes: + block = block.replace(f' "{processing_class}",', "") + block = block.replace(f', "{processing_class}"', "") + block = block.replace(f" {processing_class},", "") + block = block.replace(f", {processing_class}", "") + + if processing_class in block: + add_block = False + if add_block: + new_lines.append(replace_model_patterns(block, old_model_patterns, new_model_patterns)[0]) + else: + new_lines.append(lines[idx]) + idx += 1 + + with open(TRANSFORMERS_PATH / "__init__.py", "w", encoding="utf-8") as f: + f.write("\n".join(new_lines)) + + +def insert_tokenizer_in_auto_module(old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns): + """ + Add a tokenizer to the relevant mappings in the auto module. + + Args: + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + """ + if old_model_patterns.tokenizer_class is None or new_model_patterns.tokenizer_class is None: + return + + with open(TRANSFORMERS_PATH / "models" / "auto" / "tokenization_auto.py", "r", encoding="utf-8") as f: + content = f.read() + + pattern_tokenizer = re.compile(r"^\s*TOKENIZER_MAPPING_NAMES\s*=\s*OrderedDict\b") + lines = content.split("\n") + idx = 0 + # First we get to the TOKENIZER_MAPPING_NAMES block. 
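# --- Illustrative sketch, not part of the diff: the effect of `add_model_to_main_init` above. ---
# With hypothetical old/new patterns for "bert" -> "new-bert", each matching import block found in
# the main __init__.py is duplicated with the names rewritten, conceptually:
#     "models.bert": ["BertConfig", "BertModel"],
#     "models.new_bert": ["NewBertConfig", "NewBertModel"],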
+ while not pattern_tokenizer.search(lines[idx]): + idx += 1 + idx += 1 + + # That block ends when we reach the TOKENIZER_MAPPING assignment: + while not lines[idx].startswith("TOKENIZER_MAPPING = _LazyAutoMapping"): + # Either the whole tokenizer block is defined on one line, in which case it ends with "," + if lines[idx].endswith(","): + block = lines[idx] + # Otherwise it takes several lines until we get to the closing ")," + else: + block = [] + # accumulate lines until we hit the closing ")," of the entry + while not lines[idx].startswith(" ),"): + block.append(lines[idx]) + idx += 1 + # the closing ")," line itself still belongs to the block + block.append(lines[idx]) + block = "\n".join(block) + idx += 1 + + # If we find the model type and tokenizer class in that block, we have the old model tokenizer block + if f'"{old_model_patterns.model_type}"' in block and old_model_patterns.tokenizer_class in block: + break + + new_block = block.replace(old_model_patterns.model_type, new_model_patterns.model_type) + new_block = new_block.replace(old_model_patterns.tokenizer_class, new_model_patterns.tokenizer_class) + + new_lines = lines[:idx] + [new_block] + lines[idx:] + with open(TRANSFORMERS_PATH / "models" / "auto" / "tokenization_auto.py", "w", encoding="utf-8") as f: + f.write("\n".join(new_lines)) + + +AUTO_CLASSES_PATTERNS = { + "configuration_auto.py": [ + ' ("{model_type}", "{model_name}"),', + ' ("{model_type}", "{config_class}"),', + ' ("{model_type}", "{pretrained_archive_map}"),', + ], + "feature_extraction_auto.py": [' ("{model_type}", "{feature_extractor_class}"),'], + "image_processing_auto.py": [' ("{model_type}", "{image_processor_classes}"),'], + "modeling_auto.py": [' ("{model_type}", "{any_pt_class}"),'], + "modeling_tf_auto.py": [' ("{model_type}", "{any_tf_class}"),'], + "modeling_flax_auto.py": [' ("{model_type}", "{any_flax_class}"),'], + "processing_auto.py": [' ("{model_type}", "{processor_class}"),'], +} + + +def add_model_to_auto_classes( + old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: dict[str, list[str]] +): + """ + Add a model to the relevant mappings in the auto module. + + Args: + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + model_classes (`dict[str, list[str]]`): A dictionary mapping each framework to its list of implemented model classes.
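# --- Illustrative sketch, not part of the diff: how an AUTO_CLASSES_PATTERNS template is filled. ---
#     pattern = ' ("{model_type}", "{config_class}"),'
#     old_line = pattern.replace("{config_class}", "BertConfig").replace("{model_type}", "bert")
#     # -> ' ("bert", "BertConfig"),'
#     # `add_model_to_auto_classes` below locates this exact line in configuration_auto.py and
#     # inserts the new model's line right after it via `add_content_to_file`.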
+ """ + for filename in AUTO_CLASSES_PATTERNS: + # Extend patterns with all model classes if necessary + new_patterns = [] + for pattern in AUTO_CLASSES_PATTERNS[filename]: + if re.search("any_([a-z]*)_class", pattern) is not None: + framework = re.search("any_([a-z]*)_class", pattern).groups()[0] + if framework in model_classes: + new_patterns.extend( + [ + pattern.replace("{" + f"any_{framework}_class" + "}", cls) + for cls in model_classes[framework] + ] + ) + elif "{config_class}" in pattern: + new_patterns.append(pattern.replace("{config_class}", old_model_patterns.config_class)) + elif "{image_processor_classes}" in pattern: + if ( + old_model_patterns.image_processor_class is not None + and new_model_patterns.image_processor_class is not None + ): + if ( + old_model_patterns.image_processor_fast_class is not None + and new_model_patterns.image_processor_fast_class is not None + ): + new_patterns.append( + pattern.replace( + '"{image_processor_classes}"', + f'("{old_model_patterns.image_processor_class}", "{old_model_patterns.image_processor_fast_class}")', + ) + ) + else: + new_patterns.append( + pattern.replace( + '"{image_processor_classes}"', f'("{old_model_patterns.image_processor_class}",)' + ) + ) + elif "{feature_extractor_class}" in pattern: + if ( + old_model_patterns.feature_extractor_class is not None + and new_model_patterns.feature_extractor_class is not None + ): + new_patterns.append( + pattern.replace("{feature_extractor_class}", old_model_patterns.feature_extractor_class) + ) + elif "{processor_class}" in pattern: + if old_model_patterns.processor_class is not None and new_model_patterns.processor_class is not None: + new_patterns.append(pattern.replace("{processor_class}", old_model_patterns.processor_class)) + else: + new_patterns.append(pattern) + + # Loop through all patterns. + for pattern in new_patterns: + full_name = TRANSFORMERS_PATH / "models" / "auto" / filename + old_model_line = pattern + new_model_line = pattern + for attr in ["model_type", "model_name"]: + old_model_line = old_model_line.replace("{" + attr + "}", getattr(old_model_patterns, attr)) + new_model_line = new_model_line.replace("{" + attr + "}", getattr(new_model_patterns, attr)) + new_model_line = new_model_line.replace( + old_model_patterns.model_camel_cased, new_model_patterns.model_camel_cased + ) + add_content_to_file(full_name, new_model_line, add_after=old_model_line) + + # Tokenizers require special handling + insert_tokenizer_in_auto_module(old_model_patterns, new_model_patterns) + + +DOC_OVERVIEW_TEMPLATE = """## Overview + +The {model_name} model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + +""" + + +def duplicate_doc_file( + doc_file: Union[str, os.PathLike], + old_model_patterns: ModelPatterns, + new_model_patterns: ModelPatterns, + dest_file: Optional[Union[str, os.PathLike]] = None, + frameworks: Optional[list[str]] = None, +): + """ + Duplicate a documentation file and adapts it for a new model. + + Args: + module_file (`str` or `os.PathLike`): Path to the doc file to duplicate. + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + dest_file (`str` or `os.PathLike`, *optional*): Path to the new doc file. 
+ Will default to a file named `{new_model_patterns.model_type}.md` in the same folder as `doc_file`. + frameworks (`list[str]`, *optional*): + If passed, will only keep the model classes corresponding to this list of frameworks in the new doc file. + """ + with open(doc_file, "r", encoding="utf-8") as f: + content = f.read() + + content = re.sub(r" There are 2 Layer Norms per Transformer Block + p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) + + +class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList( + [DecisionTransformerGPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)] + ) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + # based on pattern from src/transformers/models/whisper/modeling_whisper.py::WhisperDecoder and similar addition in GPT2Model + return_legacy_cache = False + if use_cache: + if past_key_values is None: + return_legacy_cache
= True + past_key_values = DynamicCache() + elif not isinstance(past_key_values, Cache): + return_legacy_cache = True + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. " + "You should pass an instance of `Cache` instead, e.g. " + "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + if self.config.add_cross_attention and not isinstance(past_key_values, EncoderDecoderCache): + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache()) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
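# --- Illustrative sketch, not part of the diff: the additive mask conversion above, on a toy tensor. ---
#     import torch
#     mask = torch.tensor([[1.0, 1.0, 0.0]])[:, None, None, :]  # [bsz, 1, 1, seq_len]
#     additive = (1.0 - mask) * torch.finfo(torch.float32).min
#     # attended positions -> 0.0; masked positions -> ~ -3.4e38, which the softmax maps to ~0.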
+ ) + use_cache = False + + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, block in enumerate(self.h): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + past_key_values if not (self.gradient_checkpointing and self.training) else None, + cache_position, + attention_mask, + head_mask[i], + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_values = past_key_values if use_cache else None + if return_legacy_cache: + past_key_values = ( + past_key_values.self_attention_cache.to_legacy_cache() + if self.config.add_cross_attention + else past_key_values.to_legacy_cache() + ) + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for model's outputs that also contains a pooling of the last hidden states. + """ +) +class DecisionTransformerOutput(ModelOutput): + r""" + state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`): + Environment state predictions + action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`): + Model action predictions + return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`): + Predicted returns for each state + """ + + state_preds: Optional[torch.FloatTensor] = None + action_preds: Optional[torch.FloatTensor] = None + return_preds: Optional[torch.FloatTensor] = None + hidden_states: Optional[torch.FloatTensor] = None + attentions: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + + +class DecisionTransformerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = DecisionTransformerConfig + base_model_prefix = "decision_transformer" + main_input_name = "states" + supports_gradient_checkpointing = False + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring( + custom_intro=""" + The Decision Transformer Model + """ +) +class DecisionTransformerModel(DecisionTransformerPreTrainedModel): + """ + + The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL + setting. Refer to the paper for more details: https://huggingface.co/papers/2106.01345 + + """ + + def __init__(self, config): + super().__init__(config) + self.config = config + self.hidden_size = config.hidden_size + # note: the only difference between this GPT2Model and the default Huggingface version + # is that the positional embeddings are removed (since we'll add those ourselves) + self.encoder = DecisionTransformerGPT2Model(config) + + self.embed_timestep = nn.Embedding(config.max_ep_len, config.hidden_size) + self.embed_return = torch.nn.Linear(1, config.hidden_size) + self.embed_state = torch.nn.Linear(config.state_dim, config.hidden_size) + self.embed_action = torch.nn.Linear(config.act_dim, config.hidden_size) + + self.embed_ln = nn.LayerNorm(config.hidden_size) + + # note: we don't predict states or returns for the paper + self.predict_state = torch.nn.Linear(config.hidden_size, config.state_dim) + self.predict_action = nn.Sequential( + *([nn.Linear(config.hidden_size, config.act_dim)] + ([nn.Tanh()] if config.action_tanh else [])) + ) + self.predict_return = torch.nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + states: Optional[torch.FloatTensor] = None, + actions: Optional[torch.FloatTensor] = None, + rewards: Optional[torch.FloatTensor] = None, + returns_to_go: Optional[torch.FloatTensor] = None, + timesteps: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DecisionTransformerOutput]: + r""" + states (`torch.FloatTensor` of shape `(batch_size, episode_length, state_dim)`): + The states for each step in the trajectory + actions (`torch.FloatTensor` of shape `(batch_size, episode_length, act_dim)`): + The actions taken by the "expert" policy for the current state, these are masked for auto regressive + prediction + rewards (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`): + The rewards for each state, action + returns_to_go (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`): + The returns for each state in the trajectory + timesteps (`torch.LongTensor` of shape `(batch_size, episode_length)`): + The timestep for each step in the trajectory + + 
Examples: + + ```python + >>> from transformers import DecisionTransformerModel + >>> import torch + + >>> model = DecisionTransformerModel.from_pretrained("edbeeching/decision-transformer-gym-hopper-medium") + >>> # evaluation + >>> model = model.to(device) + >>> model.eval() + + >>> env = gym.make("Hopper-v3") + >>> state_dim = env.observation_space.shape[0] + >>> act_dim = env.action_space.shape[0] + + >>> state = env.reset() + >>> states = torch.from_numpy(state).reshape(1, 1, state_dim).to(device=device, dtype=torch.float32) + >>> actions = torch.zeros((1, 1, act_dim), device=device, dtype=torch.float32) + >>> rewards = torch.zeros(1, 1, device=device, dtype=torch.float32) + >>> target_return = torch.tensor(TARGET_RETURN, dtype=torch.float32).reshape(1, 1) + >>> timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1) + >>> attention_mask = torch.zeros(1, 1, device=device, dtype=torch.float32) + + >>> # forward pass + >>> with torch.no_grad(): + ... state_preds, action_preds, return_preds = model( + ... states=states, + ... actions=actions, + ... rewards=rewards, + ... returns_to_go=target_return, + ... timesteps=timesteps, + ... attention_mask=attention_mask, + ... return_dict=False, + ... ) + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = states.shape[0], states.shape[1] + + if attention_mask is None: + # attention mask for GPT: 1 if can be attended to, 0 if not + attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long) + + # embed each modality with a different head + state_embeddings = self.embed_state(states) + action_embeddings = self.embed_action(actions) + returns_embeddings = self.embed_return(returns_to_go) + time_embeddings = self.embed_timestep(timesteps) + + # time embeddings are treated similar to positional embeddings + state_embeddings = state_embeddings + time_embeddings + action_embeddings = action_embeddings + time_embeddings + returns_embeddings = returns_embeddings + time_embeddings + + # this makes the sequence look like (R_1, s_1, a_1, R_2, s_2, a_2, ...) + # which works nice in an autoregressive sense since states predict actions + stacked_inputs = ( + torch.stack((returns_embeddings, state_embeddings, action_embeddings), dim=1) + .permute(0, 2, 1, 3) + .reshape(batch_size, 3 * seq_length, self.hidden_size) + ) + stacked_inputs = self.embed_ln(stacked_inputs) + + # to make the attention mask fit the stacked inputs, have to stack it as well + stacked_attention_mask = ( + torch.stack((attention_mask, attention_mask, attention_mask), dim=1) + .permute(0, 2, 1) + .reshape(batch_size, 3 * seq_length) + ) + device = stacked_inputs.device + # we feed in the input embeddings (not word indices as in NLP) to the model + encoder_outputs = self.encoder( + inputs_embeds=stacked_inputs, + attention_mask=stacked_attention_mask, + position_ids=torch.zeros(stacked_attention_mask.shape, device=device, dtype=torch.long), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + x = encoder_outputs[0] + + # reshape x so that the second dimension corresponds to the original + # returns (0), states (1), or actions (2); i.e. 
x[:,1,t] is the token for s_t + x = x.reshape(batch_size, seq_length, 3, self.hidden_size).permute(0, 2, 1, 3) + + # get predictions + return_preds = self.predict_return(x[:, 2]) # predict next return given state and action + state_preds = self.predict_state(x[:, 2]) # predict next state given state and action + action_preds = self.predict_action(x[:, 1]) # predict next action given state + if not return_dict: + return (state_preds, action_preds, return_preds) + + return DecisionTransformerOutput( + last_hidden_state=encoder_outputs.last_hidden_state, + state_preds=state_preds, + action_preds=action_preds, + return_preds=return_preds, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +__all__ = [ + "DecisionTransformerGPT2Model", + "DecisionTransformerGPT2PreTrainedModel", + "DecisionTransformerModel", + "DecisionTransformerPreTrainedModel", +] diff --git a/transformers/src/transformers/models/depth_anything/__init__.py b/transformers/src/transformers/models/depth_anything/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7425e37e0399c792155a88488045176fb3b5e7a5 --- /dev/null +++ b/transformers/src/transformers/models/depth_anything/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_depth_anything import * + from .modeling_depth_anything import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/transformers/src/transformers/models/depth_anything/configuration_depth_anything.py b/transformers/src/transformers/models/depth_anything/configuration_depth_anything.py new file mode 100644 index 0000000000000000000000000000000000000000..b28508cbe9f039856535bd5d6ed276eb69711c51 --- /dev/null +++ b/transformers/src/transformers/models/depth_anything/configuration_depth_anything.py @@ -0,0 +1,168 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
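# --- Illustrative sketch, not part of the diff: the stack/unstack round trip used in
# DecisionTransformerModel.forward above, on toy shapes. ---
#     import torch
#     B, T, H = 2, 4, 8
#     r, s, a = torch.zeros(B, T, H), torch.ones(B, T, H), 2 * torch.ones(B, T, H)
#     stacked = torch.stack((r, s, a), dim=1).permute(0, 2, 1, 3).reshape(B, 3 * T, H)
#     # tokens are ordered (R_1, s_1, a_1, R_2, s_2, a_2, ...)
#     x = stacked.reshape(B, T, 3, H).permute(0, 2, 1, 3)
#     # x[:, 0] holds return tokens, x[:, 1] state tokens, x[:, 2] action tokens; the heads
#     # above read x[:, 2] to predict returns/states and x[:, 1] to predict actions.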
+"""DepthAnything model configuration""" + +import copy + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto.configuration_auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class DepthAnythingConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate a DepthAnything + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the DepthAnything + [LiheYoung/depth-anything-small-hf](https://huggingface.co/LiheYoung/depth-anything-small-hf) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to + leverage the [`AutoBackbone`] API. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + patch_size (`int`, *optional*, defaults to 14): + The size of the patches to extract from the backbone features. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + reassemble_hidden_size (`int`, *optional*, defaults to 384): + The number of input channels of the reassemble layers. + reassemble_factors (`list[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`): + The up/downsampling factors of the reassemble layers. + neck_hidden_sizes (`list[str]`, *optional*, defaults to `[48, 96, 192, 384]`): + The hidden sizes to project to for the feature maps of the backbone. + fusion_hidden_size (`int`, *optional*, defaults to 64): + The number of channels before fusion. + head_in_index (`int`, *optional*, defaults to -1): + The index of the features to use in the depth estimation head. + head_hidden_size (`int`, *optional*, defaults to 32): + The number of output channels in the second convolution of the depth estimation head. + depth_estimation_type (`str`, *optional*, defaults to `"relative"`): + The type of depth estimation to use. Can be one of `["relative", "metric"]`. + max_depth (`float`, *optional*): + The maximum depth to use for the "metric" depth estimation head. 20 should be used for indoor models + and 80 for outdoor models. For "relative" depth estimation, this value is ignored. 
+ + Example: + + ```python + >>> from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation + + >>> # Initializing a DepthAnything small style configuration + >>> configuration = DepthAnythingConfig() + + >>> # Initializing a model from the DepthAnything small style configuration + >>> model = DepthAnythingForDepthEstimation(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "depth_anything" + + def __init__( + self, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, + patch_size=14, + initializer_range=0.02, + reassemble_hidden_size=384, + reassemble_factors=[4, 2, 1, 0.5], + neck_hidden_sizes=[48, 96, 192, 384], + fusion_hidden_size=64, + head_in_index=-1, + head_hidden_size=32, + depth_estimation_type="relative", + max_depth=None, + **kwargs, + ): + super().__init__(**kwargs) + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. Initializing the config with the default `Dinov2` backbone.") + backbone_config = CONFIG_MAPPING["dinov2"]( + image_size=518, + hidden_size=384, + num_attention_heads=6, + out_indices=[9, 10, 11, 12], + apply_layernorm=True, + reshape_hidden_states=False, + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + self.reassemble_hidden_size = reassemble_hidden_size + self.patch_size = patch_size + self.initializer_range = initializer_range + self.reassemble_factors = reassemble_factors + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + self.head_in_index = head_in_index + self.head_hidden_size = head_hidden_size + if depth_estimation_type not in ["relative", "metric"]: + raise ValueError("depth_estimation_type must be one of ['relative', 'metric']") + self.depth_estimation_type = depth_estimation_type + self.max_depth = max_depth if max_depth else 1 + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns: + `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + + if output["backbone_config"] is not None: + output["backbone_config"] = self.backbone_config.to_dict() + + output["model_type"] = self.__class__.model_type + return output + + +__all__ = ["DepthAnythingConfig"] diff --git a/transformers/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/transformers/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..f07a76b2b2353446dd6245adf79f593af08039ab --- /dev/null +++ b/transformers/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py @@ -0,0 +1,368 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. 
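# --- Illustrative sketch, not part of the diff: metric vs. relative depth configs, per
# DepthAnythingConfig above. ---
#     from transformers import DepthAnythingConfig
#     cfg = DepthAnythingConfig(depth_estimation_type="metric", max_depth=80)  # outdoor-style
#     assert cfg.max_depth == 80
#     cfg = DepthAnythingConfig()  # relative depth; max_depth is unused and falls back to 1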
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Depth Anything checkpoints from the original repository. URL: +https://github.com/LiheYoung/Depth-Anything""" + +import argparse +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_dpt_config(model_name): + if "small" in model_name: + out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 64 + neck_hidden_sizes = [48, 96, 192, 384] + elif "base" in model_name: + out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 128 + neck_hidden_sizes = [96, 192, 384, 768] + elif "large" in model_name: + out_indices = [5, 12, 18, 24] if "v2" in model_name else [21, 22, 23, 24] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 256 + neck_hidden_sizes = [256, 512, 1024, 1024] + else: + raise NotImplementedError(f"Model not supported: {model_name}") + + if "metric" in model_name: + depth_estimation_type = "metric" + max_depth = 20 if "indoor" in model_name else 80 + else: + depth_estimation_type = "relative" + max_depth = None + + config = DepthAnythingConfig( + reassemble_hidden_size=backbone_config.hidden_size, + patch_size=backbone_config.patch_size, + backbone_config=backbone_config, + fusion_hidden_size=fusion_hidden_size, + neck_hidden_sizes=neck_hidden_sizes, + depth_estimation_type=depth_estimation_type, + max_depth=max_depth, + ) + + return config + + +def create_rename_keys(config): + rename_keys = [] + + # fmt: off + # stem + rename_keys.append(("pretrained.cls_token", "backbone.embeddings.cls_token")) + rename_keys.append(("pretrained.mask_token", "backbone.embeddings.mask_token")) + rename_keys.append(("pretrained.pos_embed", "backbone.embeddings.position_embeddings")) + rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) + + # Transformer encoder + for i in range(config.backbone_config.num_hidden_layers): + rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) + rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) + 
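# --- Illustrative sketch, not part of the diff: what `get_dpt_config` above selects per model name. ---
#     get_dpt_config("depth-anything-small")     # Dinov2-small backbone, out_indices [9, 10, 11, 12]
#     get_dpt_config("depth-anything-v2-large")  # Dinov2-large backbone, out_indices [5, 12, 18, 24]
#     get_dpt_config("depth-anything-v2-metric-indoor-small")
#     # -> depth_estimation_type="metric", max_depth=20 (indoor); outdoor variants use 80.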
rename_keys.append((f"pretrained.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) + rename_keys.append((f"pretrained.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) + rename_keys.append((f"pretrained.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) + rename_keys.append((f"pretrained.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) + rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) + rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) + rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) + rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) + rename_keys.append((f"pretrained.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"pretrained.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) + + # Head + rename_keys.append(("pretrained.norm.weight", "backbone.layernorm.weight")) + rename_keys.append(("pretrained.norm.bias", "backbone.layernorm.bias")) + + # activation postprocessing (readout projections + resize blocks) + # Depth Anything does not use CLS token => readout_projects not required + + for i in range(4): + rename_keys.append((f"depth_head.projects.{i}.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) + rename_keys.append((f"depth_head.projects.{i}.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) + + if i != 2: + rename_keys.append((f"depth_head.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) + rename_keys.append((f"depth_head.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) + + # refinenet (tricky here) + mapping = {1:3, 2:2, 3:1, 4:0} + + for i in range(1, 5): + j = mapping[i] + rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) + rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) + + # scratch convolutions + for i in range(4): + 
rename_keys.append((f"depth_head.scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) + + # head + rename_keys.append(("depth_head.scratch.output_conv1.weight", "head.conv1.weight")) + rename_keys.append(("depth_head.scratch.output_conv1.bias", "head.conv1.bias")) + rename_keys.append(("depth_head.scratch.output_conv2.0.weight", "head.conv2.weight")) + rename_keys.append(("depth_head.scratch.output_conv2.0.bias", "head.conv2.bias")) + rename_keys.append(("depth_head.scratch.output_conv2.2.weight", "head.conv3.weight")) + rename_keys.append(("depth_head.scratch.output_conv2.2.bias", "head.conv3.bias")) + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + hidden_size = config.backbone_config.hidden_size + for i in range(config.backbone_config.num_hidden_layers): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + hidden_size : hidden_size * 2 + ] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] + state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +name_to_checkpoint = { + "depth-anything-small": "pytorch_model.bin", + "depth-anything-base": "pytorch_model.bin", + "depth-anything-large": "pytorch_model.bin", + "depth-anything-v2-small": "depth_anything_v2_vits.pth", + "depth-anything-v2-base": "depth_anything_v2_vitb.pth", + "depth-anything-v2-large": "depth_anything_v2_vitl.pth", + "depth-anything-v2-metric-indoor-small": "depth_anything_v2_metric_hypersim_vits.pth", + "depth-anything-v2-metric-indoor-base": "depth_anything_v2_metric_hypersim_vitb.pth", + "depth-anything-v2-metric-indoor-large": "depth_anything_v2_metric_hypersim_vitl.pth", + "depth-anything-v2-metric-outdoor-small": "depth_anything_v2_metric_vkitti_vits.pth", + "depth-anything-v2-metric-outdoor-base": "depth_anything_v2_metric_vkitti_vitb.pth", + "depth-anything-v2-metric-outdoor-large": "depth_anything_v2_metric_vkitti_vitl.pth", + # v2-giant pending +} + + +@torch.no_grad() +def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): + """ + Copy/paste/tweak model's weights to our DPT structure. 
+ """ + + # define DPT configuration + config = get_dpt_config(model_name) + + model_name_to_repo = { + "depth-anything-small": "LiheYoung/depth_anything_vits14", + "depth-anything-base": "LiheYoung/depth_anything_vitb14", + "depth-anything-large": "LiheYoung/depth_anything_vitl14", + "depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small", + "depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base", + "depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large", + "depth-anything-v2-metric-indoor-small": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Small", + "depth-anything-v2-metric-indoor-base": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Base", + "depth-anything-v2-metric-indoor-large": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large", + "depth-anything-v2-metric-outdoor-small": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Small", + "depth-anything-v2-metric-outdoor-base": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Base", + "depth-anything-v2-metric-outdoor-large": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large", + } + + # load original state_dict + repo_id = model_name_to_repo[model_name] + filename = name_to_checkpoint[model_name] + filepath = hf_hub_download( + repo_id=repo_id, + filename=f"{filename}", + ) + + state_dict = torch.load(filepath, map_location="cpu", weights_only=True) + # rename keys + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + # read in qkv matrices + read_in_q_k_v(state_dict, config) + + # load HuggingFace model + model = DepthAnythingForDepthEstimation(config) + model.load_state_dict(state_dict) + model.eval() + + processor = DPTImageProcessor( + do_resize=True, + size={"height": 518, "width": 518}, + ensure_multiple_of=14, + keep_aspect_ratio=True, + do_rescale=True, + do_normalize=True, + image_mean=[0.485, 0.456, 0.406], + image_std=[0.229, 0.224, 0.225], + ) + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + pixel_values = processor(image, return_tensors="pt").pixel_values + + # Verify forward pass + with torch.no_grad(): + outputs = model(pixel_values) + predicted_depth = outputs.predicted_depth + + print("Shape of predicted depth:", predicted_depth.shape) + print("First values:", predicted_depth[0, :3, :3]) + + # assert logits + if verify_logits: + expected_shape = torch.Size([1, 518, 686]) + if model_name == "depth-anything-small": + expected_slice = torch.tensor( + [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]], + ) + elif model_name == "depth-anything-base": + expected_slice = torch.tensor( + [[26.3997, 26.3004, 26.3928], [26.2260, 26.2092, 26.3427], [26.0719, 26.0483, 26.1254]], + ) + elif model_name == "depth-anything-large": + expected_slice = torch.tensor( + [[87.9968, 87.7493, 88.2704], [87.1927, 87.6611, 87.3640], [86.7789, 86.9469, 86.7991]] + ) + elif model_name == "depth-anything-v2-small": + expected_slice = torch.tensor( + [[2.6751, 2.6211, 2.6571], [2.5820, 2.6138, 2.6271], [2.6160, 2.6141, 2.6306]] + ) + elif model_name == "depth-anything-v2-base": + expected_slice = torch.tensor( + [[4.3576, 4.3723, 4.3908], [4.3231, 4.3146, 4.3611], [4.3016, 4.3170, 4.3121]] + ) + elif model_name == "depth-anything-v2-large": + expected_slice = torch.tensor( + [[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]] + ) + elif model_name == "depth-anything-v2-metric-indoor-small": + 
expected_slice = torch.tensor( + [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]] + ) + elif model_name == "depth-anything-v2-metric-indoor-base": + expected_slice = torch.tensor( + [[1.4601, 1.3824, 1.4904], [1.5031, 1.4349, 1.4274], [1.4570, 1.4578, 1.4200]] + ) + elif model_name == "depth-anything-v2-metric-indoor-large": + expected_slice = torch.tensor( + [[1.5040, 1.5019, 1.5218], [1.5087, 1.5195, 1.5149], [1.5437, 1.5128, 1.5252]] + ) + elif model_name == "depth-anything-v2-metric-outdoor-small": + expected_slice = torch.tensor( + [[9.5804, 8.0339, 7.7386], [7.9890, 7.2464, 7.7149], [7.7021, 7.2330, 7.3304]] + ) + elif model_name == "depth-anything-v2-metric-outdoor-base": + expected_slice = torch.tensor( + [[10.2916, 9.0933, 8.8622], [9.1964, 9.3393, 9.0644], [8.9618, 9.4201, 9.2262]] + ) + elif model_name == "depth-anything-v2-metric-outdoor-large": + expected_slice = torch.tensor( + [[14.0137, 13.3627, 13.1080], [13.2522, 13.3943, 13.3705], [13.0581, 13.4505, 13.3925]] + ) + else: + raise ValueError("Not supported") + + assert predicted_depth.shape == torch.Size(expected_shape) + assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model and processor to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print("Pushing model and processor to hub...") + model.push_to_hub(repo_id=f"{model_name.title()}-hf") + processor.push_to_hub(repo_id=f"{model_name.title()}-hf") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="depth-anything-small", + type=str, + choices=name_to_checkpoint.keys(), + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model to the hub after conversion.", + ) + parser.add_argument( + "--verify_logits", + action="store_false", + required=False, + help="Whether to verify the logits after conversion.", + ) + + args = parser.parse_args() + convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/transformers/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py b/transformers/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..47cec7afac1a7222fbb2779ff51f840fa8c54e1d --- /dev/null +++ b/transformers/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py @@ -0,0 +1,246 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
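
As a sanity check on the `expected_shape` of `[1, 518, 686]` used in the verification above: the COCO cats image is 640x480, and the processor resizes so the shorter side becomes 518 while keeping the aspect ratio and snapping both sides to a multiple of 14. The snapping rule below assumes nearest-multiple rounding, which is a simplification of the DPT processor's behaviour but reproduces the numbers here:

```python
def snap(x, multiple=14):
    # round to the nearest multiple of the patch size
    return round(x / multiple) * multiple

height, width = 480, 640          # the COCO test image
scale = 518 / height
print(snap(height * scale), snap(width * scale))  # 518 686
```
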
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Distill Any Depth checkpoints from the original repository. URL: +https://github.com/Westlake-AGI-Lab/Distill-Any-Depth""" + +import argparse +import re +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image +from safetensors.torch import load_file + +from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + r"(backbone|pretrained)\.cls_token": r"backbone.embeddings.cls_token", + r"(backbone|pretrained)\.mask_token": r"backbone.embeddings.mask_token", + r"(backbone|pretrained)\.pos_embed": r"backbone.embeddings.position_embeddings", + r"(backbone|pretrained)\.patch_embed\.proj\.(weight|bias)": r"backbone.embeddings.patch_embeddings.projection.\2", + r"(backbone|pretrained)\.norm\.(weight|bias)": r"backbone.layernorm.\2", + r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.proj\.(weight|bias)": r"backbone.encoder.layer.\4.attention.output.dense.\5", + r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.ls(1|2)\.gamma": r"backbone.encoder.layer.\4.layer_scale\5.lambda1", + r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.mlp\.fc(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.mlp.fc\5.\6", + r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.norm(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.norm\5.\6", + r"depth_head\.projects\.(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.projection.\2", + r"depth_head\.resize_layers\.(?!2)(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.resize.\2", + r"depth_head\.scratch\.layer(\d+)_rn\.weight": lambda m: f"neck.convs.{int(m[1]) - 1}.weight", + r"depth_head\.scratch\.output_conv(\d+)(?:\.(\d+))?\.(weight|bias)": lambda m: ( + f"head.conv{int(m[1]) + (int(m[2]) // 2 if m[2] else 0)}.{m[3]}" if m[1] == "2" else f"head.conv{m[1]}.{m[3]}" + ), + r"depth_head\.scratch\.refinenet(\d+)\.out_conv\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.projection.{m[2]}", + r"depth_head\.scratch\.refinenet(\d+)\.resConfUnit(\d+)\.conv(\d+)\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.residual_layer{m[2]}.convolution{m[3]}.{m[4]}", +} + + +def get_dpt_config(model_name): + if "small" in model_name: + out_indices = [3, 6, 9, 12] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 64 + neck_hidden_sizes = [48, 96, 192, 384] + elif "base" in model_name: + out_indices = [3, 6, 9, 12] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 128 + neck_hidden_sizes = [96, 192, 384, 768] + elif "large" in model_name: + out_indices = [5, 12, 18, 24] + backbone_config = Dinov2Config.from_pretrained( + "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False + ) + fusion_hidden_size = 256 + neck_hidden_sizes = [256, 512, 1024, 1024] + else: + raise NotImplementedError(f"Model not supported: {model_name}") + + depth_estimation_type = "relative" + max_depth = None + + config = DepthAnythingConfig( + 
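
Two spot-checks of the regex renaming table above, applied by hand: one plain string substitution and one callable replacement (the refinenet rule, which performs the same index reversal as the v1 converter). The sample keys are illustrative.

```python
import re

pat = r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.norm(1|2)\.(weight|bias)"
print(re.sub(pat, r"backbone.encoder.layer.\4.norm\5.\6", "pretrained.blocks.0.3.norm1.weight"))
# -> backbone.encoder.layer.3.norm1.weight

m = re.fullmatch(
    r"depth_head\.scratch\.refinenet(\d+)\.out_conv\.(weight|bias)",
    "depth_head.scratch.refinenet1.out_conv.weight",
)
print(f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.projection.{m[2]}")
# -> neck.fusion_stage.layers.3.projection.weight
```
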
reassemble_hidden_size=backbone_config.hidden_size, + patch_size=backbone_config.patch_size, + backbone_config=backbone_config, + fusion_hidden_size=fusion_hidden_size, + neck_hidden_sizes=neck_hidden_sizes, + depth_estimation_type=depth_estimation_type, + max_depth=max_depth, + ) + + return config + + +def convert_key_pattern(key, mapping): + for pattern, replacement in mapping.items(): + match = re.fullmatch(pattern, key) + if match: + if callable(replacement): + return replacement(match) + return re.sub(pattern, replacement, key) + return None + + +def convert_keys(state_dict, config): + new_state_dict = {} + qkv_pattern = r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.qkv\.(weight|bias)" + qkv_keys = [k for k in list(state_dict.keys()) if re.match(qkv_pattern, k)] + for old_key in qkv_keys: + value = state_dict.pop(old_key) + match = re.match(qkv_pattern, old_key) + _, _, _, layer, attr = match.groups() + hidden_size = config.backbone_config.hidden_size + q = value[:hidden_size] + k = value[hidden_size : hidden_size * 2] + v = value[-hidden_size:] + + for proj, tensor in zip(["query", "key", "value"], [q, k, v]): + new_key = f"backbone.encoder.layer.{layer}.attention.attention.{proj}.{attr}" + new_state_dict[new_key] = tensor + + for old_key in list(state_dict.keys()): + value = state_dict.pop(old_key) + new_key = convert_key_pattern(old_key, ORIGINAL_TO_CONVERTED_KEY_MAPPING) + + new_state_dict[new_key] = value + + return new_state_dict + + +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + return Image.open(requests.get(url, stream=True).raw) + + +name_to_checkpoint = { + "distill-any-depth-small": "small/model.safetensors", + "distill-any-depth-base": "base/model.safetensors", + "distill-any-depth-large": "large/model.safetensors", +} + + +@torch.no_grad() +def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): + config = get_dpt_config(model_name) + + repo_id = "xingyang1/Distill-Any-Depth" + filepath = hf_hub_download(repo_id=repo_id, filename=name_to_checkpoint[model_name]) + state_dict = load_file(filepath) + + converted_state_dict = convert_keys(state_dict, config) + + model = DepthAnythingForDepthEstimation(config) + model.load_state_dict(converted_state_dict) + model.eval() + + processor = DPTImageProcessor( + do_resize=True, + size={"height": 518, "width": 518}, + ensure_multiple_of=14, + keep_aspect_ratio=True, + do_rescale=True, + do_normalize=True, + image_mean=[0.485, 0.456, 0.406], + image_std=[0.229, 0.224, 0.225], + ) + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + pixel_values = processor(image, return_tensors="pt").pixel_values + + with torch.no_grad(): + outputs = model(pixel_values) + predicted_depth = outputs.predicted_depth + + print("Shape of predicted depth:", predicted_depth.shape) + print("First values:", predicted_depth[0, :3, :3]) + + if verify_logits: + print("Verifying logits...") + expected_shape = torch.Size([1, 518, 686]) + + if model_name == "distill-any-depth-small": + expected_slice = torch.tensor( + [[2.5653, 2.5249, 2.5570], [2.4897, 2.5235, 2.5355], [2.5255, 2.5261, 2.5422]] + ) + elif model_name == "distill-any-depth-base": + expected_slice = torch.tensor( + [[4.8976, 4.9075, 4.9403], [4.8872, 4.8906, 4.9448], [4.8712, 4.8898, 4.8838]] + ) + elif model_name == "distill-any-depth-large": + expected_slice = torch.tensor( + [[55.1067, 51.1828, 51.6803], [51.9098, 50.7529, 51.4494], 
[50.1745, 50.5491, 50.8818]] + ) + else: + raise ValueError("Not supported") + + assert predicted_depth.shape == torch.Size(expected_shape) + assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model and processor to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print("Pushing model and processor to hub...") + model.push_to_hub(repo_id=f"{model_name.title()}-hf") + processor.push_to_hub(repo_id=f"{model_name.title()}-hf") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + default="distill-any-depth-small", + type=str, + choices=name_to_checkpoint.keys(), + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model to the hub after conversion.", + ) + parser.add_argument( + "--verify_logits", + action="store_true", + required=False, + help="Whether to verify the logits after conversion.", + ) + + args = parser.parse_args() + convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/transformers/src/transformers/models/depth_anything/modeling_depth_anything.py b/transformers/src/transformers/models/depth_anything/modeling_depth_anything.py new file mode 100644 index 0000000000000000000000000000000000000000..f5ace462120a19c0c41ad3c4d0c3a68bb440269b --- /dev/null +++ b/transformers/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -0,0 +1,428 @@ +# coding=utf-8 +# Copyright 2024 TikTok and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
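
One small thing to be aware of in the `push_to_hub` step of both converters above: `str.title()` capitalizes after every hyphen, so the target repo id is derived from the model name like this:

```python
model_name = "distill-any-depth-small"
print(f"{model_name.title()}-hf")  # Distill-Any-Depth-Small-hf
```
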
+"""PyTorch Depth Anything model.""" + +from typing import Optional, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...modeling_outputs import DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from ...utils.backbone_utils import load_backbone +from .configuration_depth_anything import DepthAnythingConfig + + +logger = logging.get_logger(__name__) + +# General docstring + + +class DepthAnythingReassembleLayer(nn.Module): + def __init__(self, config, channels, factor): + super().__init__() + self.projection = nn.Conv2d(in_channels=config.reassemble_hidden_size, out_channels=channels, kernel_size=1) + + # up/down sampling depending on factor + if factor > 1: + self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0) + elif factor == 1: + self.resize = nn.Identity() + elif factor < 1: + # so should downsample + self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) + + # Copied from transformers.models.dpt.modeling_dpt.DPTReassembleLayer.forward + def forward(self, hidden_state): + hidden_state = self.projection(hidden_state) + hidden_state = self.resize(hidden_state) + + return hidden_state + + +class DepthAnythingReassembleStage(nn.Module): + """ + This class reassembles the hidden states of the backbone into image-like feature representations at various + resolutions. + + This happens in 3 stages: + 1. Take the patch embeddings and reshape them to image-like feature representations. + 2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. + 3. Resizing the spatial dimensions (height, width). + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.config = config + self.layers = nn.ModuleList() + for channels, factor in zip(config.neck_hidden_sizes, config.reassemble_factors): + self.layers.append(DepthAnythingReassembleLayer(config, channels=channels, factor=factor)) + + def forward(self, hidden_states: list[torch.Tensor], patch_height=None, patch_width=None) -> list[torch.Tensor]: + """ + Args: + hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): + List of hidden states from the backbone. + """ + out = [] + + for i, hidden_state in enumerate(hidden_states): + # reshape to (batch_size, num_channels, height, width) + hidden_state = hidden_state[:, 1:] + batch_size, _, num_channels = hidden_state.shape + hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + hidden_state = self.layers[i](hidden_state) + out.append(hidden_state) + + return out + + +class DepthAnythingPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. 
+ """ + + def __init__(self, config): + super().__init__() + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=True, + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=True, + ) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + hidden_state = self.convolution1(hidden_state) + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + return hidden_state + residual + + +class DepthAnythingFeatureFusionLayer(nn.Module): + """Feature fusion layer, merges feature maps from different stages. + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + self.residual_layer1 = DepthAnythingPreActResidualLayer(config) + self.residual_layer2 = DepthAnythingPreActResidualLayer(config) + + def forward(self, hidden_state, residual=None, size=None): + if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) + hidden_state = hidden_state + self.residual_layer1(residual) + + hidden_state = self.residual_layer2(hidden_state) + + modifier = {"scale_factor": 2} if size is None else {"size": size} + + hidden_state = nn.functional.interpolate( + hidden_state, + **modifier, + mode="bilinear", + align_corners=True, + ) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +class DepthAnythingFeatureFusionStage(nn.Module): + # Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage.__init__ with DPT->DepthAnything + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList() + for _ in range(len(config.neck_hidden_sizes)): + self.layers.append(DepthAnythingFeatureFusionLayer(config)) + + def forward(self, hidden_states, size=None): + # reversing the hidden_states, we start from the last + hidden_states = hidden_states[::-1] + + fused_hidden_states = [] + fused_hidden_state = None + + for idx, (hidden_state, layer) in enumerate(zip(hidden_states, self.layers)): + size = hidden_states[idx + 1].shape[2:] if idx != (len(hidden_states) - 1) else None + + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state, size=size) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size) + + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +# Modified from transformers.models.dpt.modeling_dpt.DPTPreTrainedModel with DPT->DepthAnything,dpt->depth_anything +# avoiding sdpa and flash_attn_2 support, it's done in the backend +@auto_docstring +class DepthAnythingPreTrainedModel(PreTrainedModel): + config_class = DepthAnythingConfig + base_model_prefix = "depth_anything" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF 
version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class DepthAnythingNeck(nn.Module): + """ + DepthAnythingNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as + input and produces another list of tensors as output. For DepthAnything, it includes 2 stages: + + * DepthAnythingReassembleStage + * DepthAnythingFeatureFusionStage. + + Args: + config (dict): config dict. + """ + + def __init__(self, config): + super().__init__() + self.config = config + + self.reassemble_stage = DepthAnythingReassembleStage(config) + + self.convs = nn.ModuleList() + for channel in config.neck_hidden_sizes: + self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)) + + # fusion + self.fusion_stage = DepthAnythingFeatureFusionStage(config) + + def forward(self, hidden_states: list[torch.Tensor], patch_height=None, patch_width=None) -> list[torch.Tensor]: + """ + Args: + hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`): + List of hidden states from the backbone. + """ + if not isinstance(hidden_states, (tuple, list)): + raise TypeError("hidden_states should be a tuple or list of tensors") + + if len(hidden_states) != len(self.config.neck_hidden_sizes): + raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") + + # postprocess hidden states + hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width) + + features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)] + + # fusion blocks + output = self.fusion_stage(features) + + return output + + +class DepthAnythingDepthEstimationHead(nn.Module): + """ + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's + supplementary material). The final activation function is either ReLU or Sigmoid, depending on the depth estimation + type (relative or metric). For metric depth estimation, the output is scaled by the maximum depth used during pretraining. 
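
A shape walk-through of the neck on a toy setup. `DepthAnythingNeck` is an internal class, imported here purely for illustration; the default config approximates the small variant, and 37x37 is the patch grid of a 518x518 input. Each backbone feature carries a CLS token, which the reassemble stage strips.

```python
import torch
from transformers import DepthAnythingConfig
from transformers.models.depth_anything.modeling_depth_anything import DepthAnythingNeck

config = DepthAnythingConfig()
neck = DepthAnythingNeck(config)
# four backbone stages, each (batch, 37*37 patches + 1 CLS, hidden_size)
hidden_states = [torch.randn(1, 37 * 37 + 1, config.reassemble_hidden_size) for _ in range(4)]
for feature in neck(hidden_states, patch_height=37, patch_width=37):
    print(tuple(feature.shape))
# (1, 64, 37, 37), (1, 64, 74, 74), (1, 64, 148, 148), (1, 64, 296, 296):
# fusion_hidden_size channels at progressively doubled resolution
```
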
+ """ + + def __init__(self, config): + super().__init__() + + self.head_in_index = config.head_in_index + self.patch_size = config.patch_size + + features = config.fusion_hidden_size + self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1) + self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1) + self.activation1 = nn.ReLU() + self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0) + if config.depth_estimation_type == "relative": + self.activation2 = nn.ReLU() + elif config.depth_estimation_type == "metric": + self.activation2 = nn.Sigmoid() + else: + raise ValueError(f"Unknown depth estimation type: {config.depth_estimation_type}") + self.max_depth = config.max_depth + + def forward(self, hidden_states: list[torch.Tensor], patch_height, patch_width) -> torch.Tensor: + hidden_states = hidden_states[self.head_in_index] + + predicted_depth = self.conv1(hidden_states) + predicted_depth = nn.functional.interpolate( + predicted_depth, + (int(patch_height * self.patch_size), int(patch_width * self.patch_size)), + mode="bilinear", + align_corners=True, + ) + predicted_depth = self.conv2(predicted_depth) + predicted_depth = self.activation1(predicted_depth) + predicted_depth = self.conv3(predicted_depth) + predicted_depth = self.activation2(predicted_depth) * self.max_depth + predicted_depth = predicted_depth.squeeze(dim=1) # shape (batch_size, height, width) + + return predicted_depth + + +@auto_docstring( + custom_intro=""" + Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. + """ +) +class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel): + _no_split_modules = ["DPTViTEmbeddings"] + + def __init__(self, config): + super().__init__(config) + + self.backbone = load_backbone(config) + self.neck = DepthAnythingNeck(config) + self.head = DepthAnythingDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Examples: + ```python + >>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf") + >>> model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], + ... 
) + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = self.backbone.forward_with_filtered_kwargs( + pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + ) + hidden_states = outputs.feature_maps + + _, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + patch_height = height // patch_size + patch_width = width // patch_size + + hidden_states = self.neck(hidden_states, patch_height, patch_width) + + predicted_depth = self.head(hidden_states, patch_height, patch_width) + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["DepthAnythingForDepthEstimation", "DepthAnythingPreTrainedModel"] diff --git a/transformers/src/transformers/models/dialogpt/__init__.py b/transformers/src/transformers/models/dialogpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/transformers/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py b/transformers/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..03f38084cfbf7428678e0ec7b25d0fe2ae9dace1 --- /dev/null +++ b/transformers/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,46 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
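
The DialoGPT conversion script below boils down to a single state-dict key rename; here is the same operation on a toy dict (tensor shapes are made up):

```python
import torch

d = {"lm_head.decoder.weight": torch.zeros(2, 3), "transformer.wte.weight": torch.zeros(2, 3)}
d["lm_head.weight"] = d.pop("lm_head.decoder.weight")  # OLD_KEY -> NEW_KEY
print(sorted(d))  # ['lm_head.weight', 'transformer.wte.weight']
```
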
+ +import argparse +import os + +import torch + +from transformers.utils import WEIGHTS_NAME + + +DIALOGPT_MODELS = ["small", "medium", "large"] + +OLD_KEY = "lm_head.decoder.weight" +NEW_KEY = "lm_head.weight" + + +def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): + d = torch.load(checkpoint_path, weights_only=True) + d[NEW_KEY] = d.pop(OLD_KEY) + os.makedirs(pytorch_dump_folder_path, exist_ok=True) + torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--dialogpt_path", default=".", type=str) + args = parser.parse_args() + for MODEL in DIALOGPT_MODELS: + checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") + pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" + convert_dialogpt_checkpoint( + checkpoint_path, + pytorch_dump_folder_path, + ) diff --git a/transformers/src/transformers/models/dinov2/__init__.py b/transformers/src/transformers/models/dinov2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3cc316957eac509573bf44785209d0729ea13bb6 --- /dev/null +++ b/transformers/src/transformers/models/dinov2/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dinov2 import * + from .modeling_dinov2 import * + from .modeling_flax_dinov2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/transformers/src/transformers/models/dinov2/configuration_dinov2.py b/transformers/src/transformers/models/dinov2/configuration_dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..55fa0539a23bf0c02d0079ce41f0c5228b88c904 --- /dev/null +++ b/transformers/src/transformers/models/dinov2/configuration_dinov2.py @@ -0,0 +1,179 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""DINOv2 model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class Dinov2Config(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Dinov2Model`]. It is used to instantiate an + Dinov2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Dinov2 + [google/dinov2-base-patch16-224](https://huggingface.co/google/dinov2-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. 
+ out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. + use_mask_token (`bool`, *optional*, defaults to `True`): + Whether to use mask_token in embeddings. + + Example: + + ```python + >>> from transformers import Dinov2Config, Dinov2Model + + >>> # Initializing a Dinov2 dinov2-base-patch16-224 style configuration + >>> configuration = Dinov2Config() + + >>> # Initializing a model (with random weights) from the dinov2-base-patch16-224 style configuration + >>> model = Dinov2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dinov2" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=14, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + use_mask_token=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.use_mask_token = use_mask_token + + +class Dinov2OnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + +__all__ = ["Dinov2Config", "Dinov2OnnxConfig"] diff --git a/transformers/src/transformers/models/dinov2/convert_dinov2_to_hf.py b/transformers/src/transformers/models/dinov2/convert_dinov2_to_hf.py 
new file mode 100644 index 0000000000000000000000000000000000000000..d716191b2fcbd4775bd2349ef98a7ad0d781a90c --- /dev/null +++ b/transformers/src/transformers/models/dinov2/convert_dinov2_to_hf.py @@ -0,0 +1,285 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DINOv2 checkpoints from the original repository. + +URL: https://github.com/facebookresearch/dinov2/tree/main +""" + +import argparse +import json +from pathlib import Path + +import requests +import torch +import torch.nn as nn +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model +from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_dinov2_config(model_name, image_classifier=False): + config = Dinov2Config(image_size=518, patch_size=14) + + # size of the architecture + if "vits" in model_name: + config.hidden_size = 384 + config.num_attention_heads = 6 + elif "vitb" in model_name: + pass + elif "vitl" in model_name: + config.hidden_size = 1024 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + elif "vitg" in model_name: + config.use_swiglu_ffn = True + config.hidden_size = 1536 + config.num_hidden_layers = 40 + config.num_attention_heads = 24 + else: + raise ValueError("Model not supported") + + if image_classifier: + repo_id = "huggingface/label-files" + filename = "imagenet-1k-id2label.json" + config.num_labels = 1000 + config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + config.id2label = {int(k): v for k, v in config.id2label.items()} + + return config + + +def create_rename_keys(config): + rename_keys = [] + # fmt: off + + # patch embedding layer + rename_keys.append(("cls_token", "embeddings.cls_token")) + rename_keys.append(("mask_token", "embeddings.mask_token")) + rename_keys.append(("pos_embed", "embeddings.position_embeddings")) + rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) + + for i in range(config.num_hidden_layers): + # layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) + # MLP + if config.use_swiglu_ffn: + rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) + rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) + rename_keys.append((f"blocks.{i}.mlp.w3.weight", 
f"encoder.layer.{i}.mlp.w3.weight")) + rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) + else: + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) + # layerscale + rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) + rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) + # attention projection layer + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) + + # final layernorm + rename_keys.append(("norm.weight", "layernorm.weight")) + rename_keys.append(("norm.bias", "layernorm.bias")) + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + for i in range(config.num_hidden_layers): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] + state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] + state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + + +@torch.no_grad() +def convert_dinov2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our DINOv2 structure. 
+ """ + + # define default Dinov2 configuration + image_classifier = "1layer" in model_name + config = get_dinov2_config(model_name, image_classifier=image_classifier) + + # load original model from torch hub + original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) + original_model.eval() + + # load state_dict of original model, remove and rename some keys + state_dict = original_model.state_dict() + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config) + + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if "w12" in key: + key = key.replace("w12", "weights_in") + if "w3" in key: + key = key.replace("w3", "weights_out") + state_dict[key] = val + + # load HuggingFace model + if image_classifier: + model = Dinov2ForImageClassification(config).eval() + model.dinov2.load_state_dict(state_dict) + model_name_to_classifier_dict_url = { + "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", + "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", + "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", + "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", + } + url = model_name_to_classifier_dict_url[model_name] + classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) + model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) + else: + model = Dinov2Model(config).eval() + model.load_state_dict(state_dict) + + # load image + image = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values + std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
+ ), + ] + ) + + original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + + processor = BitImageProcessor( + size={"shortest_edge": 256}, + resample=PILImageResampling.BICUBIC, + image_mean=IMAGENET_DEFAULT_MEAN, + image_std=IMAGENET_DEFAULT_STD, + ) + pixel_values = processor(image, return_tensors="pt").pixel_values + + assert torch.allclose(original_pixel_values, pixel_values) + + with torch.no_grad(): + outputs = model(pixel_values, output_hidden_states=True) + original_outputs = original_model(pixel_values) + + # assert values + if image_classifier: + print("Predicted class:") + class_idx = outputs.logits.argmax(-1).item() + print(model.config.id2label[class_idx]) + else: + assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape + assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + model_name_to_hf_name = { + "dinov2_vits14": "dinov2-small", + "dinov2_vitb14": "dinov2-base", + "dinov2_vitl14": "dinov2-large", + "dinov2_vitg14": "dinov2-giant", + "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", + "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", + "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", + "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", + } + + name = model_name_to_hf_name[model_name] + model.push_to_hub(f"facebook/{name}") + processor.push_to_hub(f"facebook/{name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="dinov2_vitb14", + type=str, + choices=[ + "dinov2_vits14", + "dinov2_vitb14", + "dinov2_vitl14", + "dinov2_vitg14", + "dinov2_vits14_1layer", + "dinov2_vitb14_1layer", + "dinov2_vitl14_1layer", + "dinov2_vitg14_1layer", + ], + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + + args = parser.parse_args() + convert_dinov2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/transformers/src/transformers/models/dinov2/modeling_dinov2.py b/transformers/src/transformers/models/dinov2/modeling_dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..6b98e3fa8d6523779c113a84e781a2ab752f3044 --- /dev/null +++ b/transformers/src/transformers/models/dinov2/modeling_dinov2.py @@ -0,0 +1,797 @@ +# coding=utf-8 +# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DINOv2 model."""
+
+import collections.abc
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import auto_docstring, logging, torch_int
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_dinov2 import Dinov2Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dinov2Embeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, position and patch embeddings.
+    """
+
+    def __init__(self, config: Dinov2Config) -> None:
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        if config.use_mask_token:
+            self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
+        self.patch_embeddings = Dinov2PatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.use_mask_token = config.use_mask_token
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        Interpolates the pre-trained position encodings so that the model can be used on higher-resolution images.
+        This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision.
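+        Concretely, the stored square grid of patch position embeddings is resized with bicubic interpolation to the
+        `(height // patch_size, width // patch_size)` grid of the new image, while the [CLS] position embedding is
+        passed through unchanged.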
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + target_dtype = patch_pos_embed.dtype + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(torch.float32), + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ).to(dtype=target_dtype) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None and self.use_mask_token: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class Dinov2PatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
+ f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Dinov2 +class Dinov2SelfAttention(nn.Module): + def __init__(self, config: Dinov2Config) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2 +class Dinov2SelfOutput(nn.Module): + """ + The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: Dinov2Config) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Dinov2 +class Dinov2Attention(nn.Module): + def __init__(self, config: Dinov2Config) -> None: + super().__init__() + self.attention = Dinov2SelfAttention(config) + self.output = Dinov2SelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class Dinov2LayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
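+    Each sample's residual branch is zeroed out with probability `drop_prob`; surviving samples are rescaled by
+    `1 / (1 - drop_prob)` so that the expected value of the output is unchanged.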
+ + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath +class Dinov2DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class Dinov2MLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class Dinov2SwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class Dinov2Layer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: Dinov2Config) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = Dinov2Attention(config) + self.layer_scale1 = Dinov2LayerScale(config) + self.drop_path = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = Dinov2SwiGLUFFN(config) + else: + self.mlp = Dinov2MLP(config) + self.layer_scale2 = Dinov2LayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) 
-> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in Dinov2, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->Dinov2 +class Dinov2Encoder(nn.Module): + def __init__(self, config: Dinov2Config) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([Dinov2Layer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@auto_docstring +class Dinov2PreTrainedModel(PreTrainedModel): + config_class = Dinov2Config + base_model_prefix = "dinov2" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["Dinov2Layer"] + _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_flash_attn_3 = True + _supports_flex_attn = True + _supports_attention_backend = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, Dinov2Embeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + 
std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + if self.config.use_mask_token: + module.mask_token.data.zero_() + elif isinstance(module, Dinov2LayerScale): + module.lambda1.data.fill_(self.config.layerscale_value) + + +@auto_docstring +class Dinov2Model(Dinov2PreTrainedModel): + def __init__(self, config: Dinov2Config): + super().__init__(config) + self.config = config + + self.embeddings = Dinov2Embeddings(config) + self.encoder = Dinov2Encoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> Dinov2PatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for + pre-training. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + head_outputs = (sequence_output, pooled_output) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state + of the [CLS] token) e.g. for ImageNet. 
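+    More precisely, the classification head consumes the concatenation of the final [CLS] token and the mean of the
+    final patch tokens, i.e. a vector of size `2 * hidden_size`.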
+ """ +) +class Dinov2ForImageClassification(Dinov2PreTrainedModel): + def __init__(self, config: Dinov2Config) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.dinov2 = Dinov2Model(config) + + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.dinov2( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] # batch_size, sequence_length, hidden_size + + cls_token = sequence_output[:, 0] + patch_tokens = sequence_output[:, 1:] + + linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) + + logits = self.classifier(linear_input) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Dinov2 backbone, to be used with frameworks like DETR and MaskFormer. 
+ """ +) +class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = Dinov2Embeddings(config) + self.encoder = Dinov2Encoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> Dinov2PatchEmbeddings: + return self.embeddings.patch_embeddings + + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + r""" + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, 1:] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +__all__ = ["Dinov2ForImageClassification", "Dinov2Model", "Dinov2PreTrainedModel", "Dinov2Backbone"] diff --git a/transformers/src/transformers/models/dinov2/modeling_flax_dinov2.py b/transformers/src/transformers/models/dinov2/modeling_flax_dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..b9ea2eaa3ebc5ac5db192dde220dfe114ef38235 --- /dev/null +++ 
b/transformers/src/transformers/models/dinov2/modeling_flax_dinov2.py @@ -0,0 +1,801 @@ +# coding=utf-8 +# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax DINOv2 model.""" + +import collections.abc +import math +from typing import Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict + +from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling, FlaxSequenceClassifierOutput +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward +from .configuration_dinov2 import Dinov2Config + + +DINOV2_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`Dinov2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. 
+""" + +DINOV2_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`Dinov2ImageProcessor.__call__`] + for details. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class FlaxDinov2PatchEmbeddings(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + image_size = self.config.image_size + patch_size = self.config.patch_size + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.num_patches = num_patches + self.num_channels = self.config.num_channels + self.projection = nn.Conv( + self.config.hidden_size, + kernel_size=patch_size, + strides=patch_size, + padding="VALID", + dtype=self.dtype, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, "fan_in", "truncated_normal" + ), + ) + + # Copied from transformers.models.vit.modeling_flax_vit.FlaxViTPatchEmbeddings.__call__ + def __call__(self, pixel_values): + num_channels = pixel_values.shape[-1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
+ ) + embeddings = self.projection(pixel_values) + batch_size, _, _, channels = embeddings.shape + return jnp.reshape(embeddings, (batch_size, -1, channels)) + + +class FlaxDinov2Embeddings(nn.Module): + """Construct the CLS token, position and patch embeddings.""" + + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.cls_token = self.param( + "cls_token", + jax.nn.initializers.variance_scaling(self.config.initializer_range**2, "fan_in", "truncated_normal"), + (1, 1, self.config.hidden_size), + ) + if self.config.use_mask_token: + self.mask_token = self.param( + "mask_token", + jax.nn.initializers.variance_scaling(self.config.initializer_range**2, "fan_in", "truncated_normal"), + (1, self.config.hidden_size), + ) + self.patch_embeddings = FlaxDinov2PatchEmbeddings(self.config, dtype=self.dtype) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = self.param( + "position_embeddings", + jax.nn.initializers.variance_scaling(self.config.initializer_range**2, "fan_in", "truncated_normal"), + (1, num_patches + 1, self.config.hidden_size), + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def interpolate_pos_encoding(self, config, hidden_states, height, width, position_embeddings): + num_patches = hidden_states.shape[1] - 1 + num_positions = position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return position_embeddings + class_pos_embed = position_embeddings[:, 0] + patch_pos_embed = position_embeddings[:, 1:] + dim = hidden_states.shape[-1] + + h = height // config.patch_size + w = width // config.patch_size + height, width = h + 0.1, w + 0.1 + + patch_pos_embed = patch_pos_embed.reshape( + (1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + ) + patch_pos_embed = jnp.transpose(patch_pos_embed, (0, 3, 1, 2)) + target_dtype = patch_pos_embed.dtype + new_height_ratio = jnp.float32(height / math.sqrt(num_positions)) + new_width_ratio = jnp.float32(width / math.sqrt(num_positions)) + + scale = jnp.array([new_height_ratio, new_width_ratio], dtype=jnp.float32) + translation = jnp.array([0.0, 0.0], dtype=jnp.float32) + + patch_pos_embed = jax.image.scale_and_translate( + patch_pos_embed.astype(jnp.float32), + shape=(patch_pos_embed.shape[0], patch_pos_embed.shape[1], h, w), + spatial_dims=(2, 3), + scale=scale, + translation=translation, + method="bicubic", + antialias=False, + ) + patch_pos_embed = patch_pos_embed.astype(target_dtype) + patch_pos_embed = jnp.transpose(patch_pos_embed, (0, 2, 3, 1)).reshape((position_embeddings.shape[0], -1, dim)) + patch_pos_embed_expanded = jnp.tile(patch_pos_embed, (hidden_states.shape[0], 1, 1)) + class_pos_embed_expanded = jnp.tile(class_pos_embed, (hidden_states.shape[0], 1, 1)) + + return jnp.concatenate((class_pos_embed_expanded, patch_pos_embed_expanded), axis=1) + + def __call__(self, pixel_values, deterministic=True): + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embeddings.projection.dtype + height, width = pixel_values.shape[1], pixel_values.shape[2] + + embeddings = self.patch_embeddings(pixel_values.astype(target_dtype)) + + cls_tokens = jnp.broadcast_to(self.cls_token, (batch_size, 1, self.config.hidden_size)) + embeddings = jnp.concatenate((cls_tokens, embeddings), axis=1) + + embeddings = embeddings + self.interpolate_pos_encoding( + self.config, embeddings, height, width, self.position_embeddings + ) + + embeddings = self.dropout(embeddings, 
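+                                      # in Flax, dropout only fires when `deterministic=False` (i.e. during training)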
deterministic=deterministic) + return embeddings + + +# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTSelfAttention with ViT->Dinov2 +class FlaxDinov2SelfAttention(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`:" + " {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal" + ), + use_bias=self.config.qkv_bias, + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal" + ), + use_bias=self.config.qkv_bias, + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal" + ), + use_bias=self.config.qkv_bias, + ) + + def __call__(self, hidden_states, deterministic: bool = True, output_attentions: bool = False): + head_dim = self.config.hidden_size // self.config.num_attention_heads + + query_states = self.query(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + value_states = self.value(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + key_states = self.key(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTSelfOutput with ViT->Dinov2 +class FlaxDinov2SelfOutput(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, "fan_in", "truncated_normal" + ), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTAttention with ViT->Dinov2 +class FlaxDinov2Attention(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.attention = FlaxDinov2SelfAttention(self.config, dtype=self.dtype) + self.output = FlaxDinov2SelfOutput(self.config, 
dtype=self.dtype)
+
+    def __call__(self, hidden_states, deterministic=True, output_attentions: bool = False):
+        attn_outputs = self.attention(hidden_states, deterministic=deterministic, output_attentions=output_attentions)
+        attn_output = attn_outputs[0]
+        hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_outputs[1],)
+
+        return outputs
+
+
+def ones_with_scale(key, shape, scale, dtype=jnp.float32):
+    return jnp.ones(shape, dtype) * scale
+
+
+class FlaxDinov2LayerScale(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        # the stored parameter is initialized to ones; scaling by `layerscale_value` exactly once here gives the
+        # intended initial scale (the PyTorch Dinov2LayerScale above likewise applies `layerscale_value` a single time)
+        self.lambda1 = self.config.layerscale_value * self.param(
+            "lambda1",
+            jax.nn.initializers.ones,
+            (self.config.hidden_size,),
+        )
+
+    def __call__(self, hidden_states):
+        return self.lambda1 * hidden_states
+
+
+# Copied from transformers.models.beit.modeling_flax_beit.FlaxBeitDropPath with Beit -> Dinov2
+class FlaxDinov2DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    rate: float
+
+    @nn.module.compact
+    def __call__(self, inputs, deterministic: Optional[bool] = True):
+        if self.rate == 0.0:
+            return inputs
+        keep_prob = 1.0 - self.rate
+        if deterministic:
+            return inputs
+        else:
+            shape = (inputs.shape[0],) + (1,) * (inputs.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+            rng = self.make_rng("droppath")
+            random_tensor = keep_prob + jax.random.uniform(rng, shape=shape, dtype=inputs.dtype)
+            binary_tensor = jnp.floor(random_tensor)
+            output = inputs / keep_prob * binary_tensor
+            return output
+
+
+class FlaxDinov2MLP(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.fc1 = nn.Dense(
+            self.config.hidden_size * self.config.mlp_ratio,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+        self.fc2 = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+        if isinstance(self.config.hidden_act, str):
+            self.act = ACT2FN[self.config.hidden_act]
+        else:
+            self.act = self.config.hidden_act
+
+    def __call__(self, hidden_states):
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class FlaxDinov2SwiGLUFFN(nn.Module):
+    config: Dinov2Config
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        hidden_features = int(self.config.hidden_size * self.config.mlp_ratio)
+        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+
+        self.weights_in = nn.Dense(
+            2 * hidden_features,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+        self.weights_out = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+            dtype=self.dtype,
+        )
+
+    def __call__(self, hidden_states):
+        hidden_states = self.weights_in(hidden_states)
+        x1, x2 = jnp.split(hidden_states, 2, axis=-1)
+        hidden = nn.silu(x1) * x2
+        return self.weights_out(hidden)
+
+
+class
FlaxDinov2Layer(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.norm1 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.attention = FlaxDinov2Attention(self.config, dtype=self.dtype) + self.layer_scale1 = FlaxDinov2LayerScale(self.config, dtype=self.dtype) + self.drop_path = FlaxDinov2DropPath(self.config.drop_path_rate) + self.norm2 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + if self.config.use_swiglu_ffn: + self.mlp = FlaxDinov2SwiGLUFFN(self.config, dtype=self.dtype) + else: + self.mlp = FlaxDinov2MLP(self.config, dtype=self.dtype) + + self.layer_scale2 = FlaxDinov2LayerScale(self.config, dtype=self.dtype) + + def __call__(self, hidden_states, deterministic: bool = True, output_attentions: bool = False): + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2, layernorm is applied before self-attention + deterministic=deterministic, + output_attentions=output_attentions, + ) + + attention_output = self_attention_outputs[0] + + attention_output = self.layer_scale1(attention_output) + + outputs = self_attention_outputs[1:] + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in Dinov2, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTLayerCollection with ViT->Dinov2 +class FlaxDinov2LayerCollection(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxDinov2Layer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer(hidden_states, deterministic=deterministic, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states,) + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +# Copied from transformers.models.vit.modeling_flax_vit.FlaxViTEncoder with ViT->Dinov2 +class FlaxDinov2Encoder(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layer = FlaxDinov2LayerCollection(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxDinov2PreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Dinov2Config + base_model_prefix = "dinov2" + main_input_name = "pixel_values" + module_class: nn.Module = None + + def __init__( + self, + config: Dinov2Config, + input_shape=None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + if input_shape is None: + input_shape = (1, config.image_size, config.image_size, config.num_channels) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + pixel_values = jnp.zeros(input_shape, dtype=self.dtype) + + params_rng, dropout_rng = jax.random.split(rng) + dropout_rng, droppath_rng = jax.random.split(dropout_rng) + rngs = {"params": params_rng, "dropout": dropout_rng, "droppath": droppath_rng} + + random_params = self.module.init(rngs, pixel_values, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + pixel_values, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + dropout_rng, droppath_rng = jax.random.split(dropout_rng) + rngs["dropout"] = dropout_rng + rngs["droppath"] = droppath_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(pixel_values, dtype=jnp.float32), + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + +class FlaxDinov2Module(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.embeddings = FlaxDinov2Embeddings(self.config, dtype=self.dtype) + self.encoder = FlaxDinov2Encoder(self.config, dtype=self.dtype) + self.layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__( + self, + pixel_values, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + hidden_states = self.embeddings(pixel_values, deterministic=deterministic) + + encoder_outputs = self.encoder( + hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + head_outputs = (sequence_output, pooled_output) + return head_outputs + encoder_outputs[1:] + + return FlaxBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The bare Dinov2 Model transformer outputting raw hidden-states without any specific head on top.", + DINOV2_START_DOCSTRING, +) +class FlaxDinov2Model(FlaxDinov2PreTrainedModel): + module_class = FlaxDinov2Module + + +FLAX_VISION_MODEL_DOCSTRING = """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, FlaxDinov2Model + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base") + >>> model = FlaxDinov2Model.from_pretrained("facebook/dinov2-base") + + >>> inputs = image_processor(images=image, return_tensors="np") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ``` +""" + +overwrite_call_docstring(FlaxDinov2Model, FLAX_VISION_MODEL_DOCSTRING) +append_replace_return_docstrings( + FlaxDinov2Model, output_type=FlaxBaseModelOutputWithPooling, config_class=Dinov2Config +) + + +class FlaxDinov2ForImageClassificationModule(nn.Module): + config: Dinov2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dinov2 = FlaxDinov2Module(config=self.config, dtype=self.dtype) + self.classifier = nn.Dense( + self.config.num_labels, + dtype=self.dtype, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, "fan_in", "truncated_normal" + ), + ) + + def __call__( + self, + pixel_values=None, + deterministic: bool = True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.dinov2( + pixel_values, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + cls_token = hidden_states[:, 0] + patch_tokens = hidden_states[:, 1:] + linear_input = jnp.concatenate([cls_token, patch_tokens.mean(axis=1)], axis=-1) + + logits = self.classifier(linear_input) + + if not return_dict: + output = (logits,) + outputs[2:] + return output + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. 
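+    As in the PyTorch implementation, the classifier consumes the concatenation of the [CLS] token and the
+    mean-pooled patch tokens.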
+ """, + DINOV2_START_DOCSTRING, +) +class FlaxDinov2ForImageClassification(FlaxDinov2PreTrainedModel): + module_class = FlaxDinov2ForImageClassificationModule + + +FLAX_VISION_CLASSIFICATION_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoImageProcessor, FlaxDinov2ForImageClassification + >>> from PIL import Image + >>> import jax + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base-imagenet1k-1-layer") + >>> model = FlaxDinov2ForImageClassification.from_pretrained("facebook/dinov2-base-imagenet1k-1-layer", from_pt=True) + + >>> inputs = image_processor(images=image, return_tensors="np") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1) + >>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()]) + ``` +""" + +overwrite_call_docstring(FlaxDinov2ForImageClassification, FLAX_VISION_CLASSIFICATION_DOCSTRING) +append_replace_return_docstrings( + FlaxDinov2ForImageClassification, output_type=FlaxSequenceClassifierOutput, config_class=Dinov2Config +) + + +__all__ = ["FlaxDinov2ForImageClassification", "FlaxDinov2Model", "FlaxDinov2PreTrainedModel"] diff --git a/transformers/src/transformers/models/dinov2_with_registers/__init__.py b/transformers/src/transformers/models/dinov2_with_registers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2d10027b6a3b6375235a6785df044e8f0ce5fb33 --- /dev/null +++ b/transformers/src/transformers/models/dinov2_with_registers/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dinov2_with_registers import * + from .modeling_dinov2_with_registers import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/transformers/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py b/transformers/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py new file mode 100644 index 0000000000000000000000000000000000000000..ec4f446fc684f40d634927c1e7a52b64c5732b12 --- /dev/null +++ b/transformers/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py @@ -0,0 +1,159 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py. 
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change is needed, please apply it to the
+# modular_dinov2_with_registers.py file directly. One of our CI checks enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate a
+    Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DINOv2 with Registers
+    [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+ layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of register tokens to use. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. + + Example: + + ```python + >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel + + >>> # Initializing a Dinov2WithRegisters base style configuration + >>> configuration = Dinov2WithRegistersConfig() + + >>> # Initializing a model (with random weights) from the base style configuration + >>> model = Dinov2WithRegistersModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dinov2_with_registers" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + num_register_tokens=4, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.num_register_tokens = num_register_tokens + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] 
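+        # `stage_names` covers the stem plus one stage per encoder layer, e.g.
+        # ["stem", "stage1", ..., "stage12"] for the default 12-layer configuration.
+        # The helper below validates and aligns `out_features`/`out_indices` against
+        # these names, defaulting to the last stage when both are unset.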
+ self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + + +__all__ = ["Dinov2WithRegistersConfig"] diff --git a/transformers/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py b/transformers/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..0ff2697f74667e1b941ec65adb1a39cfd0a87460 --- /dev/null +++ b/transformers/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py @@ -0,0 +1,291 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DINOv2 with Registers checkpoints from the original repository. + +URL: https://github.com/facebookresearch/dinov2/tree/main +""" + +import argparse +import json +from pathlib import Path + +import requests +import torch +import torch.nn as nn +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import ( + BitImageProcessor, + Dinov2WithRegistersConfig, + Dinov2WithRegistersForImageClassification, + Dinov2WithRegistersModel, +) +from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_dinov2_with_registers_config(model_name, image_classifier=False): + config = Dinov2WithRegistersConfig(image_size=518, patch_size=14) + + # size of the architecture + if "vits" in model_name: + config.hidden_size = 384 + config.num_attention_heads = 6 + elif "vitb" in model_name: + pass + elif "vitl" in model_name: + config.hidden_size = 1024 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + elif "vitg" in model_name: + config.use_swiglu_ffn = True + config.hidden_size = 1536 + config.num_hidden_layers = 40 + config.num_attention_heads = 24 + else: + raise ValueError("Model not supported") + + if image_classifier: + repo_id = "huggingface/label-files" + filename = "imagenet-1k-id2label.json" + config.num_labels = 1000 + config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + config.id2label = {int(k): v for k, v in config.id2label.items()} + + return config + + +def create_rename_keys(config): + rename_keys = [] + # fmt: off + + # patch embedding layer + rename_keys.append(("cls_token", "embeddings.cls_token")) + rename_keys.append(("mask_token", "embeddings.mask_token")) + rename_keys.append(("pos_embed", "embeddings.position_embeddings")) + rename_keys.append(("register_tokens", "embeddings.register_tokens")) + rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) + 
rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) + + for i in range(config.num_hidden_layers): + # layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) + # MLP + if config.use_swiglu_ffn: + rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) + rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) + rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) + rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) + else: + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) + # layerscale + rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) + rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) + # attention projection layer + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) + + # final layernorm + rename_keys.append(("norm.weight", "layernorm.weight")) + rename_keys.append(("norm.bias", "layernorm.bias")) + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + for i in range(config.num_hidden_layers): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] + state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] + state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + + +@torch.no_grad() +def convert_dinov2_with_registers_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our Dinov2WithRegisters structure. 
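+
+    Example invocation (a sketch; the flags are the argparse options defined at the bottom of this
+    script, and the output path is a placeholder):
+
+        python convert_dinov2_with_registers_to_hf.py --model_name dinov2_vits14_reg \
+            --pytorch_dump_folder_path /path/to/output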
+ """ + + # define default Dinov2WithRegisters configuration + image_classifier = "1layer" in model_name + config = get_dinov2_with_registers_config(model_name, image_classifier=image_classifier) + + # load original model from torch hub + original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) + original_model.eval() + + # load state_dict of original model, remove and rename some keys + state_dict = original_model.state_dict() + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config) + + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if "w12" in key: + key = key.replace("w12", "weights_in") + if "w3" in key: + key = key.replace("w3", "weights_out") + state_dict[key] = val + + # load HuggingFace model + if image_classifier: + model = Dinov2WithRegistersForImageClassification(config).eval() + model.dinov2_with_registers.load_state_dict(state_dict) + model_name_to_classifier_dict_url = { + "dinov2_vits14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_reg4_linear_head.pth", + "dinov2_vitb14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_linear_head.pth", + "dinov2_vitl14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_linear_head.pth", + "dinov2_vitg14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_linear_head.pth", + } + url = model_name_to_classifier_dict_url[model_name] + classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) + model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) + else: + model = Dinov2WithRegistersModel(config).eval() + model.load_state_dict(state_dict) + + # load image + image = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values + std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
+ ), + ] + ) + + original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + + processor = BitImageProcessor( + size={"shortest_edge": 256}, + resample=PILImageResampling.BICUBIC, + image_mean=IMAGENET_DEFAULT_MEAN, + image_std=IMAGENET_DEFAULT_STD, + ) + pixel_values = processor(image, return_tensors="pt").pixel_values + + assert torch.allclose(original_pixel_values, pixel_values) + + with torch.no_grad(): + outputs = model(pixel_values, output_hidden_states=True) + original_outputs = original_model(pixel_values) + + # assert values + if image_classifier: + print("Predicted class:") + class_idx = outputs.logits.argmax(-1).item() + print(model.config.id2label[class_idx]) + else: + assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape + assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + model_name_to_hf_name = { + "dinov2_vits14_reg": "dinov2-with-registers-small", + "dinov2_vitb14_reg": "dinov2-with-registers-base", + "dinov2_vitl14_reg": "dinov2-with-registers-large", + "dinov2_vitg14_reg": "dinov2-with-registers-giant", + "dinov2_vits14_reg_1layer": "dinov2-with-registers-small-imagenet1k-1-layer", + "dinov2_vitb14_reg_1layer": "dinov2-with-registers-base-imagenet1k-1-layer", + "dinov2_vitl14_reg_1layer": "dinov2-with-registers-large-imagenet1k-1-layer", + "dinov2_vitg14_reg_1layer": "dinov2-with-registers-giant-imagenet1k-1-layer", + } + + name = model_name_to_hf_name[model_name] + model.push_to_hub(f"nielsr/{name}") + processor.push_to_hub(f"nielsr/{name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="dinov2_vits14_reg", + type=str, + choices=[ + "dinov2_vits14_reg", + "dinov2_vitb14_reg", + "dinov2_vitl14_reg", + "dinov2_vitg14_reg", + "dinov2_vits14_reg_1layer", + "dinov2_vitb14_reg_1layer", + "dinov2_vitl14_reg_1layer", + "dinov2_vitg14_reg_1layer", + ], + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + + args = parser.parse_args() + convert_dinov2_with_registers_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/transformers/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/transformers/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py new file mode 100644 index 0000000000000000000000000000000000000000..7d37f00daa91303519362e438d023a5db6532ccf --- /dev/null +++ b/transformers/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py @@ -0,0 +1,822 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. 
If any change is needed, please apply it to the
+# modular_dinov2_with_registers.py file directly. One of our CI checks enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections.abc
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import auto_docstring, logging, torch_int
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_dinov2_with_registers import Dinov2WithRegistersConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dinov2WithRegistersPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        num_channels = pixel_values.shape[1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
+                f" Expected {self.num_channels} but got {num_channels}."
+            )
+        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+        return embeddings
+
+
+class Dinov2WithRegistersEmbeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, register tokens, position and patch embeddings.
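+
+    The sequence produced by `forward` is laid out as `[CLS, register_1 ... register_R, patch_1 ... patch_N]`
+    with `R = config.num_register_tokens`; position embeddings are added to the CLS and patch tokens only,
+    before the register tokens are inserted.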
+ """ + + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility + with the original implementation. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py + - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # Skip interpolation for matching dimensions (unless tracing) + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + # Handle class token and patch embeddings separately + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + + # Calculate new dimensions + height = height // self.config.patch_size + width = width // self.config.patch_size + + # Reshape for interpolation + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + # Store original dtype for restoration after interpolation + target_dtype = patch_pos_embed.dtype + + # Interpolate at float32 precision + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(dtype=torch.float32), + size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor + mode="bicubic", + align_corners=False, + antialias=True, + ).to(dtype=target_dtype) + + # Validate output dimensions if not tracing + if not torch.jit.is_tracing(): + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + + # Reshape back to original format + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + # Combine class and patch embeddings + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = 
torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + # add register tokens + embeddings = torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class Dinov2WithRegistersSelfAttention(nn.Module): + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class Dinov2WithRegistersSelfOutput(nn.Module): + """ + The residual connection is defined in Dinov2WithRegistersLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class Dinov2WithRegistersAttention(nn.Module): + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + self.attention = Dinov2WithRegistersSelfAttention(config) + self.output = Dinov2WithRegistersSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class Dinov2WithRegistersLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... 
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class Dinov2WithRegistersDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class Dinov2WithRegistersMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class Dinov2WithRegistersSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class Dinov2WithRegistersLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = Dinov2WithRegistersAttention(config) + self.layer_scale1 = Dinov2WithRegistersLayerScale(config) + self.drop_path = ( + Dinov2WithRegistersDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = Dinov2WithRegistersSwiGLUFFN(config) + else: + self.mlp = Dinov2WithRegistersMLP(config) + self.layer_scale2 = Dinov2WithRegistersLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in 
Dinov2WithRegisters, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class Dinov2WithRegistersEncoder(nn.Module): + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([Dinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@auto_docstring +class Dinov2WithRegistersPreTrainedModel(PreTrainedModel): + config_class = Dinov2WithRegistersConfig + base_model_prefix = "dinov2_with_registers" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["Dinov2WithRegistersLayer"] + _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_flash_attn_3 = True + _supports_flex_attn = True + _supports_attention_backend = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, Dinov2WithRegistersEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + 
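+            # cls_token gets the same float32 truncated-normal initialization; mask_token
+            # and register_tokens are zero-initialized below.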
module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + module.mask_token.data.zero_() + module.register_tokens.data.zero_() + elif isinstance(module, Dinov2WithRegistersLayerScale): # noqa: F821 + module.lambda1.data.fill_(self.config.layerscale_value) + + +@auto_docstring +class Dinov2WithRegistersModel(Dinov2WithRegistersPreTrainedModel): + def __init__(self, config: Dinov2WithRegistersConfig): + super().__init__(config) + self.config = config + + self.embeddings = Dinov2WithRegistersEmbeddings(config) + self.encoder = Dinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for + pre-training. 
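+
+        Example (a minimal usage sketch; the checkpoint id follows the
+        `facebook/dinov2-with-registers-base` naming used elsewhere in this diff):
+
+        ```python
+        >>> from transformers import AutoImageProcessor, Dinov2WithRegistersModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+        >>> model = Dinov2WithRegistersModel.from_pretrained("facebook/dinov2-with-registers-base")
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```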
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + head_outputs = (sequence_output, pooled_output) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state + of the [CLS] token) e.g. for ImageNet. + """ +) +class Dinov2WithRegistersForImageClassification(Dinov2WithRegistersPreTrainedModel): + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.dinov2_with_registers = Dinov2WithRegistersModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.dinov2_with_registers( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] # batch_size, sequence_length, hidden_size + + cls_token = sequence_output[:, 0] + # cls and register tokens should not be included in patch tokens variable + patch_tokens = sequence_output[:, 1 + self.config.num_register_tokens :] + + linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) + + logits = self.classifier(linear_input) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer. + """ +) +class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = Dinov2WithRegistersEmbeddings(config) + self.encoder = Dinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.num_register_tokens = config.num_register_tokens + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + r""" + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... 
)
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 16, 16]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        embedding_output = self.embeddings(pixel_values)
+
+        outputs = self.encoder(
+            embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
+        )
+
+        hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+        feature_maps = ()
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, self.num_register_tokens + 1 :]
+                    # this was actually a bug in the original implementation that we copied here,
+                    # because normally the order is height, width
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps += (hidden_state,)
+
+        if not return_dict:
+            if output_hidden_states:
+                output = (feature_maps,) + outputs[1:]
+            else:
+                output = (feature_maps,) + outputs[2:]
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions if output_attentions else None,
+        )
+
+
+__all__ = [
+    "Dinov2WithRegistersPreTrainedModel",
+    "Dinov2WithRegistersModel",
+    "Dinov2WithRegistersForImageClassification",
+    "Dinov2WithRegistersBackbone",
+]
diff --git a/transformers/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py b/transformers/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd15ee75b8725158f68f26b26c7583ddd081b983
--- /dev/null
+++ b/transformers/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
@@ -0,0 +1,488 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
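+
+# Note on the modular system: this file is the single source from which
+# configuration_dinov2_with_registers.py and modeling_dinov2_with_registers.py are
+# generated (see the warning banners in those files). Classes below that merely
+# subclass their Dinov2 counterparts (e.g. Dinov2WithRegistersPatchEmbeddings) are
+# expanded into full standalone copies during generation.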
+
+from typing import Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....transformers.models.dinov2.modeling_dinov2 import (
+    Dinov2Backbone,
+    Dinov2Encoder,
+    Dinov2ForImageClassification,
+    Dinov2Model,
+    Dinov2PatchEmbeddings,
+    Dinov2PreTrainedModel,
+)
+from ...configuration_utils import PretrainedConfig
+from ...modeling_outputs import BackboneOutput, ImageClassifierOutput
+from ...utils import logging, torch_int
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate a
+    Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DINOv2 with Registers
+    [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        num_register_tokens (`int`, *optional*, defaults to 4):
+            Number of register tokens to use.
+ out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. + + Example: + + ```python + >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel + + >>> # Initializing a Dinov2WithRegisters base style configuration + >>> configuration = Dinov2WithRegistersConfig() + + >>> # Initializing a model (with random weights) from the base style configuration + >>> model = Dinov2WithRegistersModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dinov2_with_registers" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + num_register_tokens=4, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.num_register_tokens = num_register_tokens + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + + +class Dinov2WithRegistersPatchEmbeddings(Dinov2PatchEmbeddings): + pass + + +class Dinov2WithRegistersEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, register 
tokens, position and patch embeddings. + """ + + def __init__(self, config: Dinov2WithRegistersConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility + with the original implementation. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py + - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # Skip interpolation for matching dimensions (unless tracing) + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + # Handle class token and patch embeddings separately + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + + # Calculate new dimensions + height = height // self.config.patch_size + width = width // self.config.patch_size + + # Reshape for interpolation + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + # Store original dtype for restoration after interpolation + target_dtype = patch_pos_embed.dtype + + # Interpolate at float32 precision + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(dtype=torch.float32), + size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor + mode="bicubic", + align_corners=False, + antialias=True, + ).to(dtype=target_dtype) + + # Validate output dimensions if not tracing + if not torch.jit.is_tracing(): + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + + # Reshape back to original format + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + # Combine class and patch embeddings + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = 
self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + # add register tokens + embeddings = torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class Dinov2WithRegistersEncoder(Dinov2Encoder): + pass + + +class Dinov2WithRegistersPreTrainedModel(Dinov2PreTrainedModel): + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, Dinov2WithRegistersEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + module.mask_token.data.zero_() + module.register_tokens.data.zero_() + elif isinstance(module, Dinov2WithRegistersLayerScale): # noqa: F821 + module.lambda1.data.fill_(self.config.layerscale_value) + + +class Dinov2WithRegistersModel(Dinov2Model): + pass + + +class Dinov2WithRegistersForImageClassification(Dinov2ForImageClassification): + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
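+
+        Example (a minimal sketch; the base checkpoint ships without a fine-tuned
+        classification head, so the logits are illustrative only):
+
+        ```python
+        >>> from transformers import AutoImageProcessor, Dinov2WithRegistersForImageClassification
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+        >>> model = Dinov2WithRegistersForImageClassification.from_pretrained("facebook/dinov2-with-registers-base")
+
+        >>> inputs = processor(image, return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     logits = model(**inputs).logits
+        >>> logits.shape  # (batch_size, config.num_labels)
+        torch.Size([1, 2])
+        ```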
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.dinov2_with_registers( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] # batch_size, sequence_length, hidden_size + + cls_token = sequence_output[:, 0] + # cls and register tokens should not be included in patch tokens variable + patch_tokens = sequence_output[:, 1 + self.config.num_register_tokens :] + + linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1) + + logits = self.classifier(linear_input) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Dinov2WithRegistersBackbone(Dinov2Backbone): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.num_register_tokens = config.num_register_tokens + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = Dinov2WithRegistersEmbeddings(config) + self.encoder = Dinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + r""" + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... 
)
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 16, 16]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        embedding_output = self.embeddings(pixel_values)
+
+        outputs = self.encoder(
+            embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
+        )
+
+        hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+        feature_maps = ()
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, self.num_register_tokens + 1 :]
+                    # this was actually a bug in the original implementation that we copied here,
+                    # because normally the order is height, width
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps += (hidden_state,)
+
+        if not return_dict:
+            if output_hidden_states:
+                output = (feature_maps,) + outputs[1:]
+            else:
+                output = (feature_maps,) + outputs[2:]
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions if output_attentions else None,
+        )
+
+
+__all__ = [
+    "Dinov2WithRegistersConfig",
+    "Dinov2WithRegistersPreTrainedModel",
+    "Dinov2WithRegistersModel",
+    "Dinov2WithRegistersForImageClassification",
+    "Dinov2WithRegistersBackbone",
+]
diff --git a/transformers/src/transformers/models/distilbert/__init__.py b/transformers/src/transformers/models/distilbert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d6fae2e0236e7619988f0cfa3502ed49d0f90b0
--- /dev/null
+++ b/transformers/src/transformers/models/distilbert/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
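+# The body below follows the library's lazy-import pattern: under TYPE_CHECKING
+# the submodules are imported eagerly so static type checkers see every symbol,
+# while at runtime the module object is replaced by a _LazyModule that only
+# imports a submodule when one of its attributes is first accessed. A hedged
+# sketch of the observable effect (illustrative, not a doctest):
+#
+#     import transformers.models.distilbert as distilbert  # cheap, nothing heavy imported yet
+#     config = distilbert.DistilBertConfig()  # first attribute access triggers the real import
+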
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_distilbert import * + from .modeling_distilbert import * + from .modeling_flax_distilbert import * + from .modeling_tf_distilbert import * + from .tokenization_distilbert import * + from .tokenization_distilbert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/transformers/src/transformers/models/distilbert/configuration_distilbert.py b/transformers/src/transformers/models/distilbert/configuration_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..0aa6d2dfd7c589c891204b8301d7677b097e29d8 --- /dev/null +++ b/transformers/src/transformers/models/distilbert/configuration_distilbert.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DistilBERT model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DistilBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DistilBertModel`] or a [`TFDistilBertModel`]. It + is used to instantiate a DistilBERT model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the DistilBERT + [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`DistilBertModel`] or [`TFDistilBertModel`]. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + sinusoidal_pos_embds (`boolean`, *optional*, defaults to `False`): + Whether to use sinusoidal positional embeddings. + n_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + n_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + dim (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. 
+ hidden_dim (`int`, *optional*, defaults to 3072): + The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + activation (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qa_dropout (`float`, *optional*, defaults to 0.1): + The dropout probabilities used in the question answering model [`DistilBertForQuestionAnswering`]. + seq_classif_dropout (`float`, *optional*, defaults to 0.2): + The dropout probabilities used in the sequence classification and the multiple choice model + [`DistilBertForSequenceClassification`]. + + Examples: + + ```python + >>> from transformers import DistilBertConfig, DistilBertModel + + >>> # Initializing a DistilBERT configuration + >>> configuration = DistilBertConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = DistilBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "distilbert" + attribute_map = { + "hidden_size": "dim", + "num_attention_heads": "n_heads", + "num_hidden_layers": "n_layers", + } + + def __init__( + self, + vocab_size=30522, + max_position_embeddings=512, + sinusoidal_pos_embds=False, + n_layers=6, + n_heads=12, + dim=768, + hidden_dim=4 * 768, + dropout=0.1, + attention_dropout=0.1, + activation="gelu", + initializer_range=0.02, + qa_dropout=0.1, + seq_classif_dropout=0.2, + pad_token_id=0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.sinusoidal_pos_embds = sinusoidal_pos_embds + self.n_layers = n_layers + self.n_heads = n_heads + self.dim = dim + self.hidden_dim = hidden_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation = activation + self.initializer_range = initializer_range + self.qa_dropout = qa_dropout + self.seq_classif_dropout = seq_classif_dropout + super().__init__(**kwargs, pad_token_id=pad_token_id) + + +class DistilBertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["DistilBertConfig", "DistilBertOnnxConfig"] diff --git a/transformers/src/transformers/models/distilbert/modeling_distilbert.py b/transformers/src/transformers/models/distilbert/modeling_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..a047a676be09608933042051ca401c354e6aa43c --- /dev/null +++ b/transformers/src/transformers/models/distilbert/modeling_distilbert.py @@ -0,0 +1,1300 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in +part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) +""" + +import math +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import get_activation +from ...configuration_utils import PretrainedConfig +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa +from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ( + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + is_torch_greater_or_equal_than_2_2, + prune_linear_layer, +) +from ...utils import ( + auto_docstring, + logging, +) +from .configuration_distilbert import DistilBertConfig + + +if is_flash_attn_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + + +# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # + + +def create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor): + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(out, modifier_rank=0): + if torch.distributed.get_rank() == 0: + _create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out) + else: + _create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out) + + +def _create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor): + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) + out.requires_grad = False + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + + +class Embeddings(nn.Module): + def __init__(self, config: PretrainedConfig): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) + + self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) + self.dropout = nn.Dropout(config.dropout) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward(self, input_ids: torch.Tensor, input_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Parameters: + input_ids (torch.Tensor): + torch.tensor(bs, max_seq_length) The token 
ids to embed.
+            input_embeds (*optional*, torch.Tensor):
+                The pre-computed word embeddings. Can only be passed if the input ids are `None`.
+
+
+        Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type
+        embeddings)
+        """
+        if input_ids is not None:
+            input_embeds = self.word_embeddings(input_ids)  # (bs, max_seq_length, dim)
+
+        seq_length = input_embeds.size(1)
+
+        # Using the position_ids buffer registered in the constructor helps when
+        # tracing the model without passing position-ids, and avoids issues
+        # similar to issue #5664
+        if hasattr(self, "position_ids"):
+            position_ids = self.position_ids[:, :seq_length]
+        else:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # (max_seq_length)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (bs, max_seq_length)
+
+        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
+
+        embeddings = input_embeds + position_embeddings  # (bs, max_seq_length, dim)
+        embeddings = self.LayerNorm(embeddings)  # (bs, max_seq_length, dim)
+        embeddings = self.dropout(embeddings)  # (bs, max_seq_length, dim)
+        return embeddings
+
+
+class MultiHeadSelfAttention(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+
+        self.n_heads = config.n_heads
+        self.dim = config.dim
+        self.dropout = nn.Dropout(p=config.attention_dropout)
+        self.is_causal = False
+
+        # The hidden size must be evenly divisible by the number of attention heads,
+        # otherwise the per-head dimension is not well defined
+        if self.dim % self.n_heads != 0:
+            raise ValueError(f"self.n_heads: {self.n_heads} must divide self.dim: {self.dim} evenly")
+
+        self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
+        self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
+        self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
+        self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
+
+        self.pruned_heads: set[int] = set()
+        self.attention_head_size = self.dim // self.n_heads
+
+    def prune_heads(self, heads: list[int]):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.n_heads, self.attention_head_size, self.pruned_heads
+        )
+        # Prune linear layers
+        self.q_lin = prune_linear_layer(self.q_lin, index)
+        self.k_lin = prune_linear_layer(self.k_lin, index)
+        self.v_lin = prune_linear_layer(self.v_lin, index)
+        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.dim = self.attention_head_size * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            query: torch.tensor(bs, seq_length, dim)
+            key: torch.tensor(bs, seq_length, dim)
+            value: torch.tensor(bs, seq_length, dim)
+            mask: torch.tensor(bs, seq_length)
+
+        Returns:
+            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights
+            context: torch.tensor(bs, seq_length, dim) Contextualized layer.
Optional: only if `output_attentions=True` + """ + bs, q_length, dim = query.size() + k_length = key.size(1) + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' + # assert key.size() == value.size() + + dim_per_head = self.dim // self.n_heads + + mask_reshp = (bs, 1, 1, k_length) + + def shape(x: torch.Tensor) -> torch.Tensor: + """separate heads""" + return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x: torch.Tensor) -> torch.Tensor: + """group heads""" + return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) + mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) + scores = scores.masked_fill( + mask, torch.tensor(torch.finfo(scores.dtype).min) + ) # (bs, n_heads, q_length, k_length) + + weights = nn.functional.softmax(scores, dim=-1) # (bs, n_heads, q_length, k_length) + weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) + + if output_attentions: + return (context, weights) + else: + return (context,) + + +class DistilBertFlashAttention2(MultiHeadSelfAttention): + """ + DistilBert flash attention module. This module inherits from `MultiHeadSelfAttention` as the weights of the module + stays untouched. The only required change would be on the forward pass where it needs to correctly call the public + API of flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, ...]: + """ + Parameters: + query: torch.tensor(bs, seq_length, dim) + key: torch.tensor(bs, seq_length, dim) + value: torch.tensor(bs, seq_length, dim) + mask: torch.tensor(bs, seq_length) + + Returns: + weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, + seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` + """ + batch_size, q_length, dim = query.size() + + dim_per_head = self.dim // self.n_heads + + def reshape(x: torch.Tensor) -> torch.Tensor: + """separate heads""" + return x.view(batch_size, -1, self.n_heads, dim_per_head) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + query_states = reshape(self.q_lin(query)) + key_states = reshape(self.k_lin(key)) + value_states = reshape(self.v_lin(value)) + + attn_dropout = self.config.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + device_type = query_states.device.type if query_states.device.type != "mps" else "cpu" + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = ( + torch.get_autocast_dtype(device_type) + if hasattr(torch, "get_autocast_dtype") + else torch.get_autocast_gpu_dtype() + ) + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_lin.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_weights = _flash_attention_forward( + query_states, + key_states, + value_states, + mask, + q_length, + dropout=attn_dropout, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_weights_reshaped = attn_weights.reshape(batch_size, q_length, self.n_heads * dim_per_head) + attn_output = self.out_lin(attn_weights_reshaped) + + if output_attentions: + return (attn_output, attn_weights) + else: + return (attn_output,) + + +class DistilBertSdpaAttention(MultiHeadSelfAttention): + def __init__(self, config: PretrainedConfig): + super().__init__(config=config) + self.dropout_prob = config.attention_dropout + self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2 + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, ...]: + """ + Parameters: + query: torch.tensor(bs, seq_length, dim) + key: torch.tensor(bs, seq_length, dim) + value: torch.tensor(bs, seq_length, dim) + mask: torch.tensor(bs, seq_length) + + Returns: + weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, + seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True` + """ + if output_attentions or head_mask is not None: + logger.warning_once( + "DistilBertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support" + " `output_attentions=True` or `head_mask`. 
Falling back to the manual attention implementation, but specifying"
+                " the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be"
+                ' removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                query,
+                key,
+                value,
+                mask,
+                head_mask,
+                output_attentions,
+            )
+
+        batch_size, _, _ = query.size()
+        dim_per_head = self.dim // self.n_heads
+
+        def shape(x: torch.Tensor) -> torch.Tensor:
+            """separate heads"""
+            return x.view(batch_size, -1, self.n_heads, dim_per_head).transpose(1, 2)
+
+        def unshape(x: torch.Tensor) -> torch.Tensor:
+            """group heads"""
+            return x.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * dim_per_head)
+
+        q = shape(self.q_lin(query))  # (bs, n_heads, q_length, dim_per_head)
+        k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
+        v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)
+
+        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
+        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
+        # Reference: https://github.com/pytorch/pytorch/issues/112577
+        if self.require_contiguous_qkv and q.device.type == "cuda" and mask is not None:
+            q = q.contiguous()
+            k = k.contiguous()
+            v = v.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+            attn_mask=mask,
+            dropout_p=self.dropout_prob if self.training else 0.0,
+            is_causal=False,
+        )
+
+        attn_output = unshape(attn_output)
+        attn_output = self.out_lin(attn_output)
+
+        return (attn_output,)
+
+
+class FFN(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.dropout = nn.Dropout(p=config.dropout)
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
+        self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
+        self.activation = get_activation(config.activation)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
+
+    def ff_chunk(self, input: torch.Tensor) -> torch.Tensor:
+        x = self.lin1(input)
+        x = self.activation(x)
+        x = self.lin2(x)
+        x = self.dropout(x)
+        return x
+
+
+DISTILBERT_ATTENTION_CLASSES = {
+    "eager": MultiHeadSelfAttention,
+    "flash_attention_2": DistilBertFlashAttention2,
+    "sdpa": DistilBertSdpaAttention,
+}
+
+
+class TransformerBlock(GradientCheckpointingLayer):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+
+        # The hidden size must be evenly divisible by the number of attention heads
+        if config.dim % config.n_heads != 0:
+            raise ValueError(f"config.n_heads {config.n_heads} must divide config.dim {config.dim} evenly")
+
+        self.attention = DISTILBERT_ATTENTION_CLASSES[config._attn_implementation](config)
+        self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
+
+        self.ffn = FFN(config)
+        self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            x: torch.tensor(bs, seq_length, dim)
+            attn_mask: torch.tensor(bs, seq_length)
+
+        Returns:
+            sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length)
The attention weights ffn_output: + torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. + """ + # Self-Attention + sa_output = self.attention( + query=x, + key=x, + value=x, + mask=attn_mask, + head_mask=head_mask, + output_attentions=output_attentions, + ) + if output_attentions: + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples + if type(sa_output) is not tuple: + raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type") + + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + + # Feed Forward Network + ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) + ffn_output: torch.Tensor = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) + + output = (ffn_output,) + if output_attentions: + output = (sa_weights,) + output + return output + + +class Transformer(nn.Module): + def __init__(self, config: PretrainedConfig): + super().__init__() + self.n_layers = config.n_layers + self.layer = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]: # docstyle-ignore + """ + Parameters: + x: torch.tensor(bs, seq_length, dim) Input sequence embedded. + attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. + + Returns: + hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) + layer all_hidden_states: tuple[torch.tensor(bs, seq_length, dim)] + Tuple of length n_layers with the hidden states from each layer. 
+ Optional: only if output_hidden_states=True + all_attentions: tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] + Tuple of length n_layers with the attention weights from each layer + Optional: only if output_attentions=True + """ + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_state = x + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + layer_outputs = layer_module( + hidden_state, + attn_mask, + head_mask[i], + output_attentions, + ) + + hidden_state = layer_outputs[-1] + + if output_attentions: + if len(layer_outputs) != 2: + raise ValueError(f"The length of the layer_outputs should be 2, but it is {len(layer_outputs)}") + + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + if len(layer_outputs) != 1: + raise ValueError(f"The length of the layer_outputs should be 1, but it is {len(layer_outputs)}") + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # +@auto_docstring +class DistilBertPreTrainedModel(PreTrainedModel): + config_class = DistilBertConfig + load_tf_weights = None + base_model_prefix = "distilbert" + supports_gradient_checkpointing = True + _supports_flash_attn_2 = True + _supports_flash_attn_3 = True + _supports_sdpa = True + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, Embeddings) and self.config.sinusoidal_pos_embds: + create_sinusoidal_embeddings( + self.config.max_position_embeddings, self.config.dim, module.position_embeddings.weight + ) + + +@auto_docstring +class DistilBertModel(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + + self.embeddings = Embeddings(config) # Embeddings + self.transformer = Transformer(config) # Encoder + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_sdpa = config._attn_implementation == "sdpa" + + # Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.embeddings.position_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. 
If position embeddings are learned, increasing the size
+                will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the
+                end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the
+                size will add correct vectors at the end following the position encoding algorithm, whereas reducing
+                the size will remove vectors from the end.
+        """
+        num_position_embeds_diff = new_num_position_embeddings - self.config.max_position_embeddings
+
+        # no resizing needs to be done if the length stays the same
+        if num_position_embeds_diff == 0:
+            return
+
+        logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...")
+        self.config.max_position_embeddings = new_num_position_embeddings
+
+        old_position_embeddings_weight = self.embeddings.position_embeddings.weight.clone()
+
+        self.embeddings.position_embeddings = nn.Embedding(self.config.max_position_embeddings, self.config.dim)
+
+        if self.config.sinusoidal_pos_embds:
+            create_sinusoidal_embeddings(
+                n_pos=self.config.max_position_embeddings,
+                dim=self.config.dim,
+                out=self.embeddings.position_embeddings.weight,
+            )
+        else:
+            with torch.no_grad():
+                if num_position_embeds_diff > 0:
+                    self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = nn.Parameter(
+                        old_position_embeddings_weight
+                    )
+                else:
+                    self.embeddings.position_embeddings.weight = nn.Parameter(
+                        old_position_embeddings_weight[:num_position_embeds_diff]
+                    )
+        # move position_embeddings to correct device
+        self.embeddings.position_embeddings.to(self.device)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, new_embeddings: nn.Embedding):
+        self.embeddings.word_embeddings = new_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[list[int]]]):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.transformer.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
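+
+        Example (a minimal sketch; the sequence length in the printed shape
+        depends on the tokenizer, so treat it as illustrative):
+
+        ```python
+        >>> from transformers import AutoTokenizer, DistilBertModel
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+        >>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+        >>> list(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, dim)
+        [1, 8, 768]
+        ```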
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + head_mask_is_none = head_mask is None + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embeddings = self.embeddings(input_ids, inputs_embeds) # (bs, seq_length, dim) + + if self._use_flash_attention_2: + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) + + if self._use_sdpa and head_mask_is_none and not output_attentions: + attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embeddings.dtype, tgt_len=input_shape[1] + ) + + return self.transformer( + x=embeddings, + attn_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@auto_docstring( + custom_intro=""" + DistilBert Model with a `masked language modeling` head on top. + """ +) +class DistilBertForMaskedLM(DistilBertPreTrainedModel): + _tied_weights_keys = ["vocab_projector.weight"] + + def __init__(self, config: PretrainedConfig): + super().__init__(config) + + self.activation = get_activation(config.activation) + + self.distilbert = DistilBertModel(config) + self.vocab_transform = nn.Linear(config.dim, config.dim) + self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) + self.vocab_projector = nn.Linear(config.dim, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + self.mlm_loss_fct = nn.CrossEntropyLoss() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. 
+ """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.vocab_projector + + def set_output_embeddings(self, new_embeddings: nn.Module): + self.vocab_projector = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MaskedLMOutput, tuple[torch.Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + dlbrt_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = dlbrt_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = self.activation(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) + + mlm_loss = None + if labels is not None: + mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) + + if not return_dict: + output = (prediction_logits,) + dlbrt_output[1:] + return ((mlm_loss,) + output) if mlm_loss is not None else output + + return MaskedLMOutput( + loss=mlm_loss, + logits=prediction_logits, + hidden_states=dlbrt_output.hidden_states, + attentions=dlbrt_output.attentions, + ) + + +@auto_docstring( + custom_intro=""" + DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
+ """ +) +class DistilBertForSequenceClassification(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.distilbert = DistilBertModel(config) + self.pre_classifier = nn.Linear(config.dim, config.dim) + self.classifier = nn.Linear(config.dim, config.num_labels) + self.dropout = nn.Dropout(config.seq_classif_dropout) + + # Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. + """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, tuple[torch.Tensor, ...]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = nn.ReLU()(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, num_labels) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + +@auto_docstring +class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + + self.distilbert = DistilBertModel(config) + self.qa_outputs = nn.Linear(config.dim, config.num_labels) + if config.num_labels != 2: + raise ValueError(f"config.num_labels should be 2, but it is {config.num_labels}") + + self.dropout = nn.Dropout(config.qa_dropout) + + # Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. 
+ """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[QuestionAnsweringModelOutput, tuple[torch.Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + + hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() # (bs, max_query_len) + end_logits = end_logits.squeeze(-1).contiguous() # (bs, max_query_len) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + distilbert_output[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + +@auto_docstring +class DistilBertForTokenClassification(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.distilbert = DistilBertModel(config) + self.dropout = nn.Dropout(config.dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # 
Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. + """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[TokenClassifierOutput, tuple[torch.Tensor, ...]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.distilbert( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class DistilBertForMultipleChoice(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + + self.distilbert = DistilBertModel(config) + self.pre_classifier = nn.Linear(config.dim, config.dim) + self.classifier = nn.Linear(config.dim, 1) + self.dropout = nn.Dropout(config.seq_classif_dropout) + + # Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`) + The number of new position embeddings. 
If position embeddings are learned, increasing the size will add + newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If + position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will + add correct vectors at the end following the position encoding algorithm, whereas reducing the size + will remove vectors from the end. + """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MultipleChoiceModelOutput, tuple[torch.Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + + Examples: + + ```python + >>> from transformers import AutoTokenizer, DistilBertForMultipleChoice + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") + >>> model = DistilBertForMultipleChoice.from_pretrained("distilbert-base-cased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> choice0 = "It is eaten with a fork and a knife." + >>> choice1 = "It is eaten while held in the hand." 
+ >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 + + >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors="pt", padding=True) + >>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels) # batch size is 1 + + >>> # the linear classifier still needs to be trained + >>> loss = outputs.loss + >>> logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.distilbert( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs * num_choices, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim) + pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim) + pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim) + logits = self.classifier(pooled_output) # (bs * num_choices, 1) + + reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "DistilBertForMaskedLM", + "DistilBertForMultipleChoice", + "DistilBertForQuestionAnswering", + "DistilBertForSequenceClassification", + "DistilBertForTokenClassification", + "DistilBertModel", + "DistilBertPreTrainedModel", +] diff --git a/transformers/src/transformers/models/distilbert/modeling_flax_distilbert.py b/transformers/src/transformers/models/distilbert/modeling_flax_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..fba3dfd9d332d5f34dfea23a20cec7a7871de9e4 --- /dev/null +++ b/transformers/src/transformers/models/distilbert/modeling_flax_distilbert.py @@ -0,0 +1,906 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from typing import Callable, Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxQuestionAnsweringModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_distilbert import DistilBertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "distilbert-base-uncased" +_CONFIG_FOR_DOC = "DistilBertConfig" + + +FLAX_DISTILBERT_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DISTILBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +def get_angles(pos, i, d_model): + angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model)) + return pos * angle_rates + + +def positional_encoding(position, d_model): + # create the sinusoidal pattern for the positional encoding + angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model) + + # apply sin to even indices in the array; 2i + angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2]) + + # apply cos to odd indices in the array; 2i+1 + angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2]) + + pos_encoding = angle_rads[np.newaxis, ...] + + return jnp.array(pos_encoding) + + +class FlaxEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.dim, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + if not self.config.sinusoidal_pos_embds: + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.dim, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + else: + self.pos_encoding = positional_encoding(self.config.max_position_embeddings, self.config.dim) + self.LayerNorm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.dropout) + + def __call__(self, input_ids, deterministic: bool = True): + # Embed + batch_size, seq_length = input_ids.shape + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + if not self.config.sinusoidal_pos_embds: + position_ids = jnp.arange(seq_length).astype("i4") + position_ids = jnp.broadcast_to(position_ids, shape=(batch_size, seq_length)) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + else: + position_embeds = self.pos_encoding[:, :seq_length, :] + # explicitly cast the positions here, since self.embed_positions are not registered as parameters + position_embeds = position_embeds.astype(inputs_embeds.dtype) + + # Sum all embeddings + hidden_states = inputs_embeds + position_embeds + + # Layer Norm + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxMultiHeadSelfAttention(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.n_heads = self.config.n_heads + self.dim = self.config.dim + self.dropout = nn.Dropout(rate=self.config.attention_dropout) + + if not (self.dim % self.n_heads == 0): + raise ValueError(f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}") + + self.q_lin = nn.Dense( + self.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.k_lin = nn.Dense( + self.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.v_lin = nn.Dense( + self.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.out_lin = nn.Dense( + self.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + query, + key, + value, + mask, + deterministic: bool = True, + output_attentions: bool = False, + ): + bs, q_len, dim = query.shape + k_len 
= key.shape[1] + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' + # assert key.size() == value.size() + + dim_per_head = self.dim // self.n_heads + + mask_reshp = (bs, 1, 1, k_len) + + def shape(x): + """separate heads""" + return x.reshape(bs, -1, self.n_heads, dim_per_head).transpose(0, 2, 1, 3) + + def unshape(x): + """group heads""" + return x.transpose(0, 2, 1, 3).reshape(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_len, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_len, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_len, dim_per_head) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_len, dim_per_head) + scores = jnp.matmul(q, k.transpose(0, 1, 3, 2)) # (bs, n_heads, q_len, k_len) + mask = jnp.reshape(mask, mask_reshp) + + mask = mask.astype(scores.dtype) + scores = scores - 1e30 * (1.0 - mask) + + weights = nn.softmax(scores, axis=-1) # (bs, n_heads, q_len, k_len) + weights = self.dropout(weights, deterministic=deterministic) + + context = jnp.matmul(weights, v) # (bs, n_heads, q_len, dim_per_head) + context = unshape(context) # (bs, q_len, dim) + context = self.out_lin(context) # (bs, q_len, dim) + + if output_attentions: + return (context, weights) + else: + return (context,) + + +class FlaxFFN(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout = nn.Dropout(rate=self.config.dropout) + self.chunk_size_feed_forward = self.config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.lin1 = nn.Dense( + self.config.hidden_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.lin2 = nn.Dense( + self.config.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + self.activation = ACT2FN[self.config.activation] + + def __call__(self, hidden_states, deterministic: bool = True): + hidden_states = self.lin1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.lin2(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxTransformerBlock(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + assert self.config.dim % self.config.n_heads == 0, ( + f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}" + ) + + self.attention = FlaxMultiHeadSelfAttention(self.config, dtype=self.dtype) + self.sa_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) + + self.ffn = FlaxFFN(self.config, dtype=self.dtype) + self.output_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attn_mask, + output_attentions: bool = False, + deterministic: bool = True, + ): + # Self-Attention + sa_output = self.attention( + query=hidden_states, + key=hidden_states, + value=hidden_states, + mask=attn_mask, + output_attentions=output_attentions, + deterministic=deterministic, + ) + if output_attentions: + sa_output, sa_weights = sa_output + else: + assert type(sa_output) is tuple + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + hidden_states) + + # Feed Forward Network + ffn_output = self.ffn(sa_output, deterministic=deterministic) + ffn_output = self.output_layer_norm(ffn_output + sa_output) + output = (ffn_output,) + if 
output_attentions: + output = (sa_weights,) + output + return output + + +class FlaxTransformer(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxTransformerBlock(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.n_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask, + output_attentions: bool = False, + output_hidden_states: bool = False, + deterministic: bool = True, + return_dict: bool = False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for layer_module in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states=hidden_states, + attn_mask=attention_mask, + output_attentions=output_attentions, + deterministic=deterministic, + ) + hidden_states = layer_outputs[-1] + + if output_attentions: + assert len(layer_outputs) == 2 + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + assert len(layer_outputs) == 1 + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_attentions, all_hidden_states] if v is not None) + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxTransformerEncoder(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layer = FlaxTransformer(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + output_attentions: bool = False, + output_hidden_states: bool = False, + deterministic: bool = True, + return_dict: bool = False, + ): + return self.layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + deterministic=deterministic, + return_dict=return_dict, + ) + + +class FlaxDistilBertLMDecoder(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) + + def __call__(self, inputs, kernel): + inputs = jnp.asarray(inputs, self.dtype) + kernel = jnp.asarray(kernel, self.dtype) + y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ()))) + bias = jnp.asarray(self.bias, self.dtype) + y = y + bias + return y + + +class FlaxDistilBertPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = DistilBertConfig + base_model_prefix = "distilbert" + module_class: nn.Module = None + + def __init__( + self, + config: DistilBertConfig, + input_shape: tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, attention_mask, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + head_mask=None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + +class FlaxDistilBertModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.embeddings = FlaxEmbeddings(self.config, dtype=self.dtype) + self.transformer = FlaxTransformerEncoder(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + input_embeds = self.embeddings(input_ids, deterministic=deterministic) + return self.transformer( + hidden_states=input_embeds, + attention_mask=attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@add_start_docstrings( + "The bare DistilBert Model 
transformer outputting raw hidden-states without any specific head on top.", + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertModel(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertModule + + +append_call_sample_docstring(FlaxDistilBertModel, _CHECKPOINT_FOR_DOC, None, _CONFIG_FOR_DOC) + + +class FlaxDistilBertForMaskedLMModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.distilbert = FlaxDistilBertModule(self.config, dtype=self.dtype) + self.vocab_transform = nn.Dense( + self.config.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.vocab_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) + if self.config.tie_word_embeddings: + self.vocab_projector = FlaxDistilBertLMDecoder( + self.config, + dtype=self.dtype, + ) + else: + self.vocab_projector = nn.Dense( + self.config.vocab_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + dlbrt_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + deterministic=deterministic, + return_dict=return_dict, + ) + hidden_states = dlbrt_output[0] + prediction_logits = self.vocab_transform(hidden_states) + prediction_logits = ACT2FN[self.config.activation](prediction_logits) + prediction_logits = self.vocab_layer_norm(prediction_logits) + + if self.config.tie_word_embeddings: + shared_embedding = self.distilbert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + prediction_logits = self.vocab_projector(prediction_logits, shared_embedding.T) + else: + prediction_logits = self.vocab_projector(prediction_logits) + + if not return_dict: + output = (prediction_logits,) + dlbrt_output[1:] + return output + + return FlaxMaskedLMOutput( + logits=prediction_logits, + hidden_states=dlbrt_output.hidden_states, + attentions=dlbrt_output.attentions, + ) + + +@add_start_docstrings("""DistilBert Model with a `language modeling` head on top.""", FLAX_DISTILBERT_START_DOCSTRING) +class FlaxDistilBertForMaskedLM(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForMaskedLMModule + + +append_call_sample_docstring(FlaxDistilBertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC) + + +class FlaxDistilBertForSequenceClassificationModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype) + self.pre_classifier = nn.Dense( + self.config.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout) + self.classifier = nn.Dense( + self.config.num_labels, + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Model + 
distilbert_output = self.distilbert( + input_ids, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = ACT2FN["relu"](pooled_output) + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) # (bs, dim) + + if not return_dict: + return (logits,) + distilbert_output[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + +@add_start_docstrings( + """ + DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertForSequenceClassification(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxDistilBertForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxDistilBertForMultipleChoiceModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype) + self.pre_classifier = nn.Dense( + self.config.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout) + self.classifier = nn.Dense( + 1, + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + + # Model + outputs = self.distilbert( + input_ids, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_state = outputs[0] + pooled_output = hidden_state[:, 0] + pooled_output = self.pre_classifier(pooled_output) + pooled_output = ACT2FN["relu"](pooled_output) + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[2:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for RocStories/SWAG tasks. 
+ """, + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertForMultipleChoice(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForMultipleChoiceModule + + +overwrite_call_docstring( + FlaxDistilBertForMultipleChoice, DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxDistilBertForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxDistilBertForTokenClassificationModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Model + outputs = self.distilbert( + input_ids, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. 
+ """, + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertForTokenClassification(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForTokenClassificationModule + + +append_call_sample_docstring( + FlaxDistilBertForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxDistilBertForQuestionAnsweringModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + assert self.config.num_labels == 2 + self.dropout = nn.Dropout(rate=self.config.qa_dropout) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Model + distilbert_output = self.distilbert( + input_ids, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = distilbert_output[0] + + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.qa_outputs(hidden_states) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + distilbert_output[1:] + + return FlaxQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + +@add_start_docstrings( + """ + DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertForQuestionAnswering(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForQuestionAnsweringModule + + +append_call_sample_docstring( + FlaxDistilBertForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxDistilBertForMaskedLM", + "FlaxDistilBertForMultipleChoice", + "FlaxDistilBertForQuestionAnswering", + "FlaxDistilBertForSequenceClassification", + "FlaxDistilBertForTokenClassification", + "FlaxDistilBertModel", + "FlaxDistilBertPreTrainedModel", +] diff --git a/transformers/src/transformers/models/distilbert/modeling_tf_distilbert.py b/transformers/src/transformers/models/distilbert/modeling_tf_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..8a72ebf44367f4ff0ee7fd8dfcbda6657d175a1e --- /dev/null +++ b/transformers/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -0,0 +1,1147 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TF 2.0 DistilBERT model +""" + +from __future__ import annotations + +import warnings +from typing import Optional, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_distilbert import DistilBertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "distilbert-base-uncased" +_CONFIG_FOR_DOC = "DistilBertConfig" + + +class TFEmbeddings(keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dim = config.dim + self.initializer_range = config.initializer_range + self.max_position_embeddings = config.max_position_embeddings + self.LayerNorm = keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.dropout) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.dim], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.dim], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.dim]) + + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False): + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. 
+ """ + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + final_embeddings = inputs_embeds + position_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +class TFMultiHeadSelfAttention(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.n_heads = config.n_heads + self.dim = config.dim + self.dropout = keras.layers.Dropout(config.attention_dropout) + self.output_attentions = config.output_attentions + + assert self.dim % self.n_heads == 0, f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}" + + self.q_lin = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin" + ) + self.k_lin = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin" + ) + self.v_lin = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin" + ) + self.out_lin = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin" + ) + + self.pruned_heads = set() + self.config = config + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, query, key, value, mask, head_mask, output_attentions, training=False): + """ + Parameters: + query: tf.Tensor(bs, seq_length, dim) + key: tf.Tensor(bs, seq_length, dim) + value: tf.Tensor(bs, seq_length, dim) + mask: tf.Tensor(bs, seq_length) + + Returns: + weights: tf.Tensor(bs, n_heads, seq_length, seq_length) Attention weights context: tf.Tensor(bs, + seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` + """ + bs, q_length, dim = shape_list(query) + k_length = shape_list(key)[1] + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' + # assert key.size() == value.size() + dim_per_head = int(self.dim / self.n_heads) + dim_per_head = tf.cast(dim_per_head, dtype=tf.int32) + mask_reshape = [bs, 1, 1, k_length] + + def shape(x): + """separate heads""" + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + + def unshape(x): + """group heads""" + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + q = tf.cast(q, dtype=tf.float32) + q = tf.multiply(q, tf.math.rsqrt(tf.cast(dim_per_head, dtype=tf.float32))) + k = tf.cast(k, dtype=q.dtype) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) + + mask = tf.cast(mask, dtype=scores.dtype) + scores = scores - 1e30 * (1.0 - mask) + weights = stable_softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) + + if output_attentions: + return (context, weights) + else: + return (context,) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_lin", None) is not None: + with tf.name_scope(self.q_lin.name): + self.q_lin.build([None, None, self.config.dim]) + if getattr(self, "k_lin", None) is not None: + with tf.name_scope(self.k_lin.name): + self.k_lin.build([None, None, self.config.dim]) + if getattr(self, "v_lin", None) is not None: + with tf.name_scope(self.v_lin.name): + self.v_lin.build([None, None, self.config.dim]) + if getattr(self, "out_lin", None) is not None: + with tf.name_scope(self.out_lin.name): + self.out_lin.build([None, None, self.config.dim]) + + +class TFFFN(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dropout = keras.layers.Dropout(config.dropout) + self.lin1 = keras.layers.Dense( + config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1" + ) + self.lin2 = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" + ) + self.activation = get_tf_activation(config.activation) + self.config = config + + def call(self, input, training=False): + x = self.lin1(input) + x = self.activation(x) + x = self.lin2(x) + x = self.dropout(x, training=training) + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "lin1", None) is not None: + with tf.name_scope(self.lin1.name): + self.lin1.build([None, None, self.config.dim]) + if getattr(self, "lin2", None) is not None: + with tf.name_scope(self.lin2.name): + self.lin2.build([None, None, self.config.hidden_dim]) + + +class TFTransformerBlock(keras.layers.Layer): + def 
__init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.n_heads = config.n_heads + self.dim = config.dim + self.hidden_dim = config.hidden_dim + self.dropout = keras.layers.Dropout(config.dropout) + self.activation = config.activation + self.output_attentions = config.output_attentions + + assert config.dim % config.n_heads == 0, ( + f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}" + ) + + self.attention = TFMultiHeadSelfAttention(config, name="attention") + self.sa_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm") + + self.ffn = TFFFN(config, name="ffn") + self.output_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") + self.config = config + + def call(self, x, attn_mask, head_mask, output_attentions, training=False): # removed: src_enc=None, src_len=None + """ + Parameters: + x: tf.Tensor(bs, seq_length, dim) + attn_mask: tf.Tensor(bs, seq_length) + + Outputs: sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: + tf.Tensor(bs, seq_length, dim) The output of the transformer block contextualization. + """ + # Self-Attention + sa_output = self.attention(x, x, x, attn_mask, head_mask, output_attentions, training=training) + if output_attentions: + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples + # assert type(sa_output) == tuple + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + + # Feed Forward Network + ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) + ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) + + output = (ffn_output,) + if output_attentions: + output = (sa_weights,) + output + return output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "sa_layer_norm", None) is not None: + with tf.name_scope(self.sa_layer_norm.name): + self.sa_layer_norm.build([None, None, self.config.dim]) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build(None) + if getattr(self, "output_layer_norm", None) is not None: + with tf.name_scope(self.output_layer_norm.name): + self.output_layer_norm.build([None, None, self.config.dim]) + + +class TFTransformer(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.n_layers = config.n_layers + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + + self.layer = [TFTransformerBlock(config, name=f"layer_._{i}") for i in range(config.n_layers)] + + def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict, training=False): + # docstyle-ignore + """ + Parameters: + x: tf.Tensor(bs, seq_length, dim) Input sequence embedded. + attn_mask: tf.Tensor(bs, seq_length) Attention mask on the sequence. + + Returns: + hidden_state: tf.Tensor(bs, seq_length, dim) + Sequence of hidden states in the last (top) layer + all_hidden_states: tuple[tf.Tensor(bs, seq_length, dim)] + Tuple of length n_layers with the hidden states from each layer. 
+ Optional: only if output_hidden_states=True + all_attentions: tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)] + Tuple of length n_layers with the attention weights from each layer + Optional: only if output_attentions=True + """ + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_state = x + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + layer_outputs = layer_module(hidden_state, attn_mask, head_mask[i], output_attentions, training=training) + hidden_state = layer_outputs[-1] + + if output_attentions: + assert len(layer_outputs) == 2 + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + assert len(layer_outputs) == 1, f"Incorrect number of outputs {len(layer_outputs)} instead of 1" + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFDistilBertMainLayer(keras.layers.Layer): + config_class = DistilBertConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.num_hidden_layers = config.num_hidden_layers + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + + self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings + self.transformer = TFTransformer(config, name="transformer") # Encoder + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings.weight = value + self.embeddings.vocab_size = value.shape[0] + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + ): + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.ones(input_shape) # (bs, seq_length) + + attention_mask = tf.cast(attention_mask, dtype=tf.float32) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + + embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) 
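+ # NOTE (editor): at this point `attention_mask` is a float tensor of ones and zeros with
+ # shape (bs, seq_length). Inside TFMultiHeadSelfAttention it is reshaped to (bs, 1, 1, k_length),
+ # queries are pre-scaled by 1/sqrt(dim_per_head), and the mask is applied additively as
+ # `scores - 1e30 * (1.0 - mask)`, which drives masked positions to ~0 after the softmax.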
+ tfmr_output = self.transformer( + embedding_output, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=training, + ) + + return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # +class TFDistilBertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DistilBertConfig + base_model_prefix = "distilbert" + + +DISTILBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DISTILBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). 
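+
+ Example (editor's sketch, not part of the original docstring -- it illustrates the equivalent
+ call styles described above, assuming the generic `distilbert-base-uncased` checkpoint):
+
+ ```python
+ >>> from transformers import AutoTokenizer, TFDistilBertModel
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+ >>> model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
+ >>> enc = tokenizer("Hello world", return_tensors="tf")
+ >>> out = model(enc)  # dict/BatchEncoding in the first positional argument
+ >>> out = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])  # keyword arguments
+ ```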
+""" + + +@add_start_docstrings( + "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertModel(TFDistilBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutput, tuple[tf.Tensor]]: + outputs = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + + +class TFDistilBertLMHead(keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.dim = config.dim + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self): + return self.input_embeddings + + def set_output_embeddings(self, value): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +@add_start_docstrings( + """DistilBert Model with a `masked language modeling` head on top.""", + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.config = config + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.vocab_transform = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform" + ) + self.act = get_tf_activation(config.activation) + self.vocab_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") + self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") + + def get_lm_head(self): + return self.vocab_projector + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.vocab_projector.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMaskedLMOutput, tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = distilbert_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_projector(prediction_logits) + + loss = None if labels is None else self.hf_compute_loss(labels, prediction_logits) + + if not return_dict: + output = (prediction_logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "vocab_transform", None) is not None: + with tf.name_scope(self.vocab_transform.name): + self.vocab_transform.build([None, None, self.config.dim]) + if getattr(self, "vocab_layer_norm", None) is not None: + with tf.name_scope(self.vocab_layer_norm.name): + self.vocab_layer_norm.build([None, None, self.config.dim]) + if getattr(self, "vocab_projector", None) is not None: + with tf.name_scope(self.vocab_projector.name): + self.vocab_projector.build(None) + + +@add_start_docstrings( + """ + DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
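+
+ Example (editor's note -- a minimal inference sketch, not part of the original docstring; it
+ assumes the public `distilbert-base-uncased-finetuned-sst-2-english` checkpoint):
+
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import AutoTokenizer, TFDistilBertForSequenceClassification
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
+ >>> model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
+ >>> inputs = tokenizer("A thoroughly enjoyable film.", return_tensors="tf")
+ >>> logits = model(**inputs).logits  # (1, num_labels)
+ >>> label = model.config.id2label[int(tf.math.argmax(logits, axis=-1)[0])]
+ ```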
+ """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.pre_classifier = keras.layers.Dense( + config.dim, + kernel_initializer=get_initializer(config.initializer_range), + activation="relu", + name="pre_classifier", + ) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.dropout = keras.layers.Dropout(config.seq_classif_dropout) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFSequenceClassifierOutput, tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, dim) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "pre_classifier", None) is not None: + with tf.name_scope(self.pre_classifier.name): + self.pre_classifier.build([None, None, self.config.dim]) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.dim]) + + +@add_start_docstrings( + """ + DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
+ for Named-Entity-Recognition (NER) tasks. + """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.dropout = keras.layers.Dropout(config.dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFTokenClassifierOutput, tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + outputs = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for RocStories/SWAG tasks. 
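+
+ Example of shaping multiple-choice inputs (editor's sketch; the head flattens the choice
+ dimension internally and reshapes the logits back to `(batch_size, num_choices)`):
+
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import AutoTokenizer, TFDistilBertForMultipleChoice
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+ >>> model = TFDistilBertForMultipleChoice.from_pretrained("distilbert-base-uncased")
+ >>> prompt = "The sky is"
+ >>> choices = ["blue on a clear day.", "a kind of sorting algorithm."]
+ >>> enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
+ >>> inputs = {k: tf.expand_dims(v, 0) for k, v in enc.items()}  # (1, num_choices, seq_length)
+ >>> logits = model(**inputs).logits  # (1, num_choices)
+ ```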
+ """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.dropout = keras.layers.Dropout(config.seq_classif_dropout) + self.pre_classifier = keras.layers.Dense( + config.dim, + kernel_initializer=get_initializer(config.initializer_range), + activation="relu", + name="pre_classifier", + ) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward( + DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMultipleChoiceModelOutput, tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) + """ + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_inputs_embeds = ( + tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + distilbert_output = self.distilbert( + flat_input_ids, + flat_attention_mask, + head_mask, + flat_inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "pre_classifier", None) is not None: + 
with tf.name_scope(self.pre_classifier.name): + self.pre_classifier.build([None, None, self.config.dim]) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.dim]) + + +@add_start_docstrings( + """ + DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2" + self.dropout = keras.layers.Dropout(config.qa_dropout) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFQuestionAnsweringModelOutput, tuple[tf.Tensor]]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
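+
+ Example (editor's note -- a usage sketch, not part of the original docstring; it assumes the
+ public `distilbert-base-cased-distilled-squad` checkpoint):
+
+ ```python
+ >>> import tensorflow as tf
+ >>> from transformers import AutoTokenizer, TFDistilBertForQuestionAnswering
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
+ >>> model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
+ >>> question, context = "What does DistilBERT approximate?", "DistilBERT is a distilled version of BERT."
+ >>> inputs = tokenizer(question, context, return_tensors="tf")
+ >>> outputs = model(**inputs)
+ >>> start = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
+ >>> end = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
+ >>> answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
+ ```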
+ """ + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + hidden_states = self.dropout(hidden_states, training=training) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + loss = None + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.dim]) + + +__all__ = [ + "TFDistilBertForMaskedLM", + "TFDistilBertForMultipleChoice", + "TFDistilBertForQuestionAnswering", + "TFDistilBertForSequenceClassification", + "TFDistilBertForTokenClassification", + "TFDistilBertMainLayer", + "TFDistilBertModel", + "TFDistilBertPreTrainedModel", +] diff --git a/transformers/src/transformers/models/distilbert/tokenization_distilbert.py b/transformers/src/transformers/models/distilbert/tokenization_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..d2a629052f0345fb8943f7cd5300f6706e4b3314 --- /dev/null +++ b/transformers/src/transformers/models/distilbert/tokenization_distilbert.py @@ -0,0 +1,492 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for DistilBERT.""" + +import collections +import os +import unicodedata +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class DistilBertTokenizer(PreTrainedTokenizer): + r""" + Construct a DistilBERT tokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). 
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + clean_up_tokenization_spaces=True, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + @property + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size + def vocab_size(self): + return len(self.vocab) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize + def _tokenize(self, text, split_special_tokens=False): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens if not split_special_tokens else None + ): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single 
string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. 
+ never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
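+ # NOTE (editor): flow below -- optionally pad CJK characters with spaces, NFC-normalize,
+ # whitespace-split, then per token (unless it is in `never_split`): lowercase and/or strip
+ # accents (strip_accents=None means "strip only when lowercasing", as in the original BERT),
+ # and finally split on punctuation before re-joining on whitespace.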
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. 
This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +__all__ = ["DistilBertTokenizer"] diff --git a/transformers/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/transformers/src/transformers/models/distilbert/tokenization_distilbert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..c174804dc530c863b14c3ab6e56c18117d3fa4c2 --- /dev/null +++ b/transformers/src/transformers/models/distilbert/tokenization_distilbert_fast.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DistilBERT.""" + +import json +from typing import Optional + +from tokenizers import normalizers + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_distilbert import DistilBertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +class DistilBertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. 
It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (`bool`, *optional*, defaults to `True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this + issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + wordpieces_prefix (`str`, *optional*, defaults to `"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = DistilBertTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + normalizer_state.get("lowercase", do_lower_case) != do_lower_case + or normalizer_state.get("strip_accents", strip_accents) != strip_accents + or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars + ): + normalizer_class = getattr(normalizers, normalizer_state.pop("type")) + normalizer_state["lowercase"] = do_lower_case + normalizer_state["strip_accents"] = strip_accents + normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars + self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) + + self.do_lower_case = do_lower_case + + # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1 is not None: + output += token_ids_1 + [self.sep_token_id] + + return output + + # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["DistilBertTokenizerFast"] diff --git a/transformers/src/transformers/models/donut/__init__.py b/transformers/src/transformers/models/donut/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..834c451f78fa0d4c5fe91f59719b6505c4c4e4e5 --- /dev/null +++ b/transformers/src/transformers/models/donut/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_donut_swin import * + from .feature_extraction_donut import * + from .image_processing_donut import * + from .image_processing_donut_fast import * + from .modeling_donut_swin import * + from .processing_donut import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/transformers/src/transformers/models/donut/configuration_donut_swin.py b/transformers/src/transformers/models/donut/configuration_donut_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..9aac07dace7688273be0bdc57da0a12663c2fb5b --- /dev/null +++ b/transformers/src/transformers/models/donut/configuration_donut_swin.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Donut Swin Transformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DonutSwinConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DonutSwinModel`]. 
It is used to instantiate a + Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Donut + [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 4): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embed_dim (`int`, *optional*, defaults to 96): + Dimensionality of patch embedding. + depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): + Depth of each layer in the Transformer encoder. + num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`): + Number of attention heads in each layer of the Transformer encoder. + window_size (`int`, *optional*, defaults to 7): + Size of windows. + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of MLP hidden dimensionality to embedding dimensionality. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether or not a learnable bias should be added to the queries, keys and values. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + use_absolute_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to add absolute position embeddings to the patch embeddings. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. 
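+
+    With the defaults, the channel dimension after the last stage, exposed as `hidden_size`, is
+    `int(embed_dim * 2 ** (len(depths) - 1)) = 96 * 2**3 = 768` (an illustrative check of the arithmetic done
+    in `__init__`):
+
+    ```python
+    >>> from transformers import DonutSwinConfig
+
+    >>> DonutSwinConfig().hidden_size
+    768
+    ```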
+ + Example: + + ```python + >>> from transformers import DonutSwinConfig, DonutSwinModel + + >>> # Initializing a Donut naver-clova-ix/donut-base style configuration + >>> configuration = DonutSwinConfig() + + >>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration + >>> model = DonutSwinModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "donut-swin" + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + image_size=224, + patch_size=4, + num_channels=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + initializer_range=0.02, + layer_norm_eps=1e-5, + **kwargs, + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_layers = len(depths) + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) + + +__all__ = ["DonutSwinConfig"] diff --git a/transformers/src/transformers/models/donut/convert_donut_to_pytorch.py b/transformers/src/transformers/models/donut/convert_donut_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..f6f14f6d08e31037389f448815242b388545fd15 --- /dev/null +++ b/transformers/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -0,0 +1,234 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Donut checkpoints using the original `donut-python` library. 
URL: https://github.com/clovaai/donut""" + +import argparse + +import torch +from datasets import load_dataset +from donut import DonutModel + +from transformers import ( + DonutImageProcessor, + DonutProcessor, + DonutSwinConfig, + DonutSwinModel, + MBartConfig, + MBartForCausalLM, + VisionEncoderDecoderModel, + XLMRobertaTokenizerFast, +) + + +def get_configs(model): + original_config = model.config + + encoder_config = DonutSwinConfig( + image_size=original_config.input_size, + patch_size=4, + depths=original_config.encoder_layer, + num_heads=[4, 8, 16, 32], + window_size=original_config.window_size, + embed_dim=128, + ) + decoder_config = MBartConfig( + is_decoder=True, + is_encoder_decoder=False, + add_cross_attention=True, + decoder_layers=original_config.decoder_layer, + max_position_embeddings=original_config.max_position_embeddings, + vocab_size=len( + model.decoder.tokenizer + ), # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) + scale_embedding=True, + add_final_layer_norm=True, + ) + + return encoder_config, decoder_config + + +def rename_key(name): + if "encoder.model" in name: + name = name.replace("encoder.model", "encoder") + if "decoder.model" in name: + name = name.replace("decoder.model", "decoder") + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") + if "patch_embed.norm" in name: + name = name.replace("patch_embed.norm", "embeddings.norm") + if name.startswith("encoder"): + if "layers" in name: + name = "encoder." + name + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn" in name and "mask" not in name: + name = name.replace("attn", "attention.self") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + + if name == "encoder.norm.weight": + name = "encoder.layernorm.weight" + if name == "encoder.norm.bias": + name = "encoder.layernorm.bias" + + return name + + +def convert_state_dict(orig_state_dict, model): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + key_split = key.split(".") + layer_num = int(key_split[3]) + block_num = int(key_split[5]) + dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size + + if "weight" in key: + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" + ] = val[:dim, :] + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( + val[dim : dim * 2, :] + ) + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" + ] = val[-dim:, :] + else: + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( + val[:dim] + ) + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = ( + val[dim : dim * 2] + ) + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( + val[-dim:] + ) + elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: + # HuggingFace implementation doesn't use attn_mask buffer + # and model doesn't use final 
LayerNorms for the encoder
+            pass
+        else:
+            orig_state_dict[rename_key(key)] = val
+
+    return orig_state_dict
+
+
+def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
+    # load original model
+    original_model = DonutModel.from_pretrained(model_name).eval()
+
+    # load HuggingFace model
+    encoder_config, decoder_config = get_configs(original_model)
+    encoder = DonutSwinModel(encoder_config)
+    decoder = MBartForCausalLM(decoder_config)
+    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
+    model.eval()
+
+    state_dict = original_model.state_dict()
+    new_state_dict = convert_state_dict(state_dict, model)
+    model.load_state_dict(new_state_dict)
+
+    # verify results on scanned document
+    dataset = load_dataset("hf-internal-testing/example-documents")  # no-script
+    image = dataset["test"][0]["image"].convert("RGB")
+
+    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True)
+    image_processor = DonutImageProcessor(
+        do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1]
+    )
+    processor = DonutProcessor(image_processor, tokenizer)
+    pixel_values = processor(image, return_tensors="pt").pixel_values
+
+    if model_name == "naver-clova-ix/donut-base-finetuned-docvqa":
+        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
+        question = "When is the coffee break?"
+        task_prompt = task_prompt.replace("{user_input}", question)
+    elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip":
+        task_prompt = "<s_rvlcdip>"
+    elif model_name in [
+        "naver-clova-ix/donut-base-finetuned-cord-v1",
+        "naver-clova-ix/donut-base-finetuned-cord-v1-2560",
+    ]:
+        task_prompt = "<s_cord-v1>"
+    elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
+        task_prompt = "<s_cord-v2>"
+    elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket":
+        task_prompt = "<s_zhtrainticket>"
+    elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]:
+        # use a random prompt
+        task_prompt = "hello world"
+    else:
+        raise ValueError("Model name not supported")
+    prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[
+        "input_ids"
+    ]
+
+    original_patch_embed = original_model.encoder.model.patch_embed(pixel_values)
+    patch_embeddings, _ = model.encoder.embeddings(pixel_values)
+    assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3)
+
+    # verify encoder hidden states
+    original_last_hidden_state = original_model.encoder(pixel_values)
+    last_hidden_state = model.encoder(pixel_values).last_hidden_state
+    assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2)
+
+    # verify decoder hidden states
+    original_logits = original_model(pixel_values, prompt_tensors, None).logits
+    logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits
+    assert torch.allclose(original_logits, logits, atol=1e-3)
+    print("Looks ok!")
+
+    if pytorch_dump_folder_path is not None:
+        print(f"Saving model and processor to {pytorch_dump_folder_path}")
+        model.save_pretrained(pytorch_dump_folder_path)
+        processor.save_pretrained(pytorch_dump_folder_path)
+
+    if push_to_hub:
+        model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
+        processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--model_name",
+        default="naver-clova-ix/donut-base-finetuned-docvqa",
+        required=False,
type=str, + help="Name of the original model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + required=False, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether or not to push the converted model and processor to the 🤗 hub.", + ) + + args = parser.parse_args() + convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/transformers/src/transformers/models/donut/feature_extraction_donut.py b/transformers/src/transformers/models/donut/feature_extraction_donut.py new file mode 100644 index 0000000000000000000000000000000000000000..e37a58ddd3055e040c6c29cbd5f5cc3c34270cbe --- /dev/null +++ b/transformers/src/transformers/models/donut/feature_extraction_donut.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Donut.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_donut import DonutImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class DonutFeatureExtractor(DonutImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class DonutFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use DonutImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["DonutFeatureExtractor"] diff --git a/transformers/src/transformers/models/donut/image_processing_donut.py b/transformers/src/transformers/models/donut/image_processing_donut.py new file mode 100644 index 0000000000000000000000000000000000000000..e8a48dcabef664abf04d6f66cb4285a4da8c2620 --- /dev/null +++ b/transformers/src/transformers/models/donut/image_processing_donut.py @@ -0,0 +1,477 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Image processor class for Donut.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + pad, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, logging +from ...utils.import_utils import is_vision_available, requires + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +@requires(backends=("vision",)) +class DonutImageProcessor(BaseImageProcessor): + r""" + Constructs a Donut image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_thumbnail (`bool`, *optional*, defaults to `True`): + Whether to resize the image using thumbnail method. + do_align_long_axis (`bool`, *optional*, defaults to `False`): + Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image. If `random_padding` is set to `True` in `preprocess`, each image is padded with a + random amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are + padded to the largest image size in the batch. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Image standard deviation. 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_thumbnail: bool = True, + do_align_long_axis: bool = False, + do_pad: bool = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + size = size if size is not None else {"height": 2560, "width": 1920} + if isinstance(size, (tuple, list)): + # The previous feature extractor size parameter was in (width, height) format + size = size[::-1] + size = get_size_dict(size) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_thumbnail = do_thumbnail + self.do_align_long_axis = do_align_long_axis + self.do_pad = do_pad + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def align_long_axis( + self, + image: np.ndarray, + size: dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Align the long axis of the image to the longest axis of the specified size. + + Args: + image (`np.ndarray`): + The image to be aligned. + size (`dict[str, int]`): + The size `{"height": h, "width": w}` to align the long axis to. + data_format (`str` or `ChannelDimension`, *optional*): + The data format of the output image. If unset, the same format as the input image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + + Returns: + `np.ndarray`: The aligned image. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = size["height"], size["width"] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(image) + + if input_data_format == ChannelDimension.LAST: + rot_axes = (0, 1) + elif input_data_format == ChannelDimension.FIRST: + rot_axes = (1, 2) + else: + raise ValueError(f"Unsupported data format: {input_data_format}") + + if (output_width < output_height and input_width > input_height) or ( + output_width > output_height and input_width < input_height + ): + image = np.rot90(image, 3, axes=rot_axes) + + if data_format is not None: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + + return image + + def pad_image( + self, + image: np.ndarray, + size: dict[str, int], + random_padding: bool = False, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad the image to the specified size. + + Args: + image (`np.ndarray`): + The image to be padded. + size (`dict[str, int]`): + The size `{"height": h, "width": w}` to pad the image to. + random_padding (`bool`, *optional*, defaults to `False`): + Whether to use random padding or not. 
+ data_format (`str` or `ChannelDimension`, *optional*): + The data format of the output image. If unset, the same format as the input image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + output_height, output_width = size["height"], size["width"] + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + + delta_width = output_width - input_width + delta_height = output_height - input_height + + if random_padding: + pad_top = np.random.randint(low=0, high=delta_height + 1) + pad_left = np.random.randint(low=0, high=delta_width + 1) + else: + pad_top = delta_height // 2 + pad_left = delta_width // 2 + + pad_bottom = delta_height - pad_top + pad_right = delta_width - pad_left + + padding = ((pad_top, pad_bottom), (pad_left, pad_right)) + return pad(image, padding, data_format=data_format, input_data_format=input_data_format) + + def pad(self, *args, **kwargs): + logger.info("pad is deprecated and will be removed in version 4.27. Please use pad_image instead.") + return self.pad_image(*args, **kwargs) + + def thumbnail( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any + corresponding dimension of the specified size. + + Args: + image (`np.ndarray`): + The image to be resized. + size (`dict[str, int]`): + The size `{"height": h, "width": w}` to resize the image to. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + The resampling filter to use. + data_format (`Optional[Union[str, ChannelDimension]]`, *optional*): + The data format of the output image. If unset, the same format as the input image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = size["height"], size["width"] + + # We always resize to the smallest of either the input or output size. + height = min(input_height, output_height) + width = min(input_width, output_width) + + if height == input_height and width == input_width: + return image + + if input_height > input_width: + width = int(input_width * height / input_height) + elif input_width > input_height: + height = int(input_height * width / input_width) + + return resize( + image, + size=(height, width), + resample=resample, + reducing_gap=2.0, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resizes `image` to `(height, width)` specified by `size` using the PIL library. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. 
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        size = get_size_dict(size)
+        shortest_edge = min(size["height"], size["width"])
+        output_size = get_resize_output_image_size(
+            image, size=shortest_edge, default_to_square=False, input_data_format=input_data_format
+        )
+        resized_image = resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        return resized_image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_thumbnail: Optional[bool] = None,
+        do_align_long_axis: Optional[bool] = None,
+        do_pad: Optional[bool] = None,
+        random_padding: bool = False,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Shortest edge of the image is resized to min(size["height"],
+                size["width"]) with the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`):
+                Whether to resize the image using thumbnail method.
+            do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`):
+                Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to pad the image. If `random_padding` is set to `True`, each image is padded with a random
+                amount of padding on each side, up to the target `size`. Otherwise, all images are padded to the
+                target `size`.
+            random_padding (`bool`, *optional*, defaults to `False`):
+                Whether to use random padding when padding the image. If `True`, each image in the batch will be
+                padded with a random amount of padding on each side, up to the target `size`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image pixel values.
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + if isinstance(size, (tuple, list)): + # Previous feature extractor had size in (width, height) format + size = size[::-1] + size = get_size_dict(size) + resample = resample if resample is not None else self.resample + do_thumbnail = do_thumbnail if do_thumbnail is not None else self.do_thumbnail + do_align_long_axis = do_align_long_axis if do_align_long_axis is not None else self.do_align_long_axis + do_pad = do_pad if do_pad is not None else self.do_pad + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_pad=do_pad, + size_divisibility=size, # There is no pad divisibility in this processor, but pad requires the size arg. + do_resize=do_resize, + size=size, + resample=resample, + ) + + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. 
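+        # The transforms below run in a fixed order: optional 90-degree rotation
+        # (align_long_axis), shortest-edge resize, aspect-preserving thumbnail capped
+        # at `size`, padding up to exactly (size["height"], size["width"]), then
+        # rescaling and mean/std normalization.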
+ images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_align_long_axis: + images = [self.align_long_axis(image, size=size, input_data_format=input_data_format) for image in images] + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_thumbnail: + images = [self.thumbnail(image=image, size=size, input_data_format=input_data_format) for image in images] + + if do_pad: + images = [ + self.pad_image( + image=image, size=size, random_padding=random_padding, input_data_format=input_data_format + ) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["DonutImageProcessor"] diff --git a/transformers/src/transformers/models/donut/image_processing_donut_fast.py b/transformers/src/transformers/models/donut/image_processing_donut_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..8ec0235544170b941a63930816370c1f7934cf48 --- /dev/null +++ b/transformers/src/transformers/models/donut/image_processing_donut_fast.py @@ -0,0 +1,265 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Fast Image processor class for Donut.""" + +from typing import Optional, Union + +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_transforms import group_images_by_shape, reorder_images +from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + logging, +) + + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + +if is_torchvision_available(): + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +class DonutFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + Args: + do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`): + Whether to resize the image using thumbnail method. + do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`): + Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image. If `random_padding` is set to `True`, each image is padded with a random + amount of padding on each size, up to the largest image size in the batch. Otherwise, all images are + padded to the largest image size in the batch. + """ + + do_thumbnail: Optional[bool] + do_align_long_axis: Optional[bool] + do_pad: Optional[bool] + + +@auto_docstring +class DonutImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 2560, "width": 1920} + do_resize = True + do_rescale = True + do_normalize = True + do_thumbnail = True + do_align_long_axis = False + do_pad = True + valid_kwargs = DonutFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]): + size = kwargs.pop("size", None) + if isinstance(size, (tuple, list)): + size = size[::-1] + kwargs["size"] = size + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutFastImageProcessorKwargs]) -> BatchFeature: + if "size" in kwargs: + size = kwargs.pop("size") + if isinstance(size, (tuple, list)): + size = size[::-1] + kwargs["size"] = size + return super().preprocess(images, **kwargs) + + def align_long_axis( + self, + image: "torch.Tensor", + size: SizeDict, + ) -> "torch.Tensor": + """ + Align the long axis of the image to the longest axis of the specified size. + + Args: + image (`torch.Tensor`): + The image to be aligned. + size (`dict[str, int]`): + The size `{"height": h, "width": w}` to align the long axis to. + + Returns: + `torch.Tensor`: The aligned image. + """ + input_height, input_width = image.shape[-2:] + output_height, output_width = size.height, size.width + + if (output_width < output_height and input_width > input_height) or ( + output_width > output_height and input_width < input_height + ): + height_dim, width_dim = image.dim() - 2, image.dim() - 1 + image = torch.rot90(image, 3, dims=[height_dim, width_dim]) + + return image + + def pad_image( + self, + image: "torch.Tensor", + size: SizeDict, + random_padding: bool = False, + ) -> "torch.Tensor": + """ + Pad the image to the specified size. 
+
+        Args:
+            image (`torch.Tensor`):
+                The image to be padded.
+            size (`dict[str, int]`):
+                The size `{"height": h, "width": w}` to pad the image to.
+            random_padding (`bool`, *optional*, defaults to `False`):
+                Whether to use random padding or not.
+        """
+        output_height, output_width = size.height, size.width
+        input_height, input_width = image.shape[-2:]
+
+        delta_width = output_width - input_width
+        delta_height = output_height - input_height
+
+        if random_padding:
+            # draw per-image offsets uniformly from [0, delta]
+            pad_top = int(torch.randint(low=0, high=delta_height + 1, size=(1,)).item())
+            pad_left = int(torch.randint(low=0, high=delta_width + 1, size=(1,)).item())
+        else:
+            pad_top = delta_height // 2
+            pad_left = delta_width // 2
+
+        pad_bottom = delta_height - pad_top
+        pad_right = delta_width - pad_left
+
+        padding = (pad_left, pad_top, pad_right, pad_bottom)
+        return F.pad(image, padding)
+
+    def pad(self, *args, **kwargs):
+        logger.info("pad is deprecated and will be removed in version 4.27. Please use pad_image instead.")
+        return self.pad_image(*args, **kwargs)
+
+    def thumbnail(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+    ) -> "torch.Tensor":
+        """
+        Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
+        corresponding dimension of the specified size.
+
+        Args:
+            image (`torch.Tensor`):
+                The image to be resized.
+            size (`dict[str, int]`):
+                The size `{"height": h, "width": w}` to resize the image to.
+        """
+        input_height, input_width = image.shape[-2:]
+        output_height, output_width = size.height, size.width
+
+        # We always resize to the smallest of either the input or output size.
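+        # Each side is first capped at the target; the branch below then shrinks the
+        # other side proportionally, so the aspect ratio is preserved and no output
+        # dimension exceeds `size`.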
+ height = min(input_height, output_height) + width = min(input_width, output_width) + + if height == input_height and width == input_width: + return image + + if input_height > input_width: + width = int(input_width * height / input_height) + elif input_width > input_height: + height = int(input_height * width / input_width) + + return self.resize( + image, + size=SizeDict(width=width, height=height), + interpolation=F.InterpolationMode.BICUBIC, + ) + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + do_thumbnail: bool, + do_align_long_axis: bool, + do_pad: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_align_long_axis: + stacked_images = self.align_long_axis(image=stacked_images, size=size) + if do_resize: + shortest_edge = min(size.height, size.width) + stacked_images = self.resize( + image=stacked_images, size=SizeDict(shortest_edge=shortest_edge), interpolation=interpolation + ) + if do_thumbnail: + stacked_images = self.thumbnail(image=stacked_images, size=size) + if do_pad: + stacked_images = self.pad_image(image=stacked_images, size=size, random_padding=False) + + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + +__all__ = ["DonutImageProcessorFast"] diff --git a/transformers/src/transformers/models/donut/modeling_donut_swin.py b/transformers/src/transformers/models/donut/modeling_donut_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..7af6a3ad07d88f10ffdd9d865d91d91cd2af997d --- /dev/null +++ b/transformers/src/transformers/models/donut/modeling_donut_swin.py @@ -0,0 +1,1032 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Donut Swin Transformer model. + +This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden +states.""" + +import collections.abc +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, logging, torch_int +from .configuration_donut_swin import DonutSwinConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + DonutSwin encoder's outputs, with potential hidden states and attentions. + """ +) +# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin +class DonutSwinEncoderOutput(ModelOutput): + r""" + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + DonutSwin model's outputs that also contains a pooling of the last hidden states. + """ +) +# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin +class DonutSwinModelOutput(ModelOutput): + r""" + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): + Average pooling of the last layer hidden-state. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + DonutSwin outputs for image classification. 
+ """ +) +# Copied from transformers.models.swin.modeling_swin.SwinImageClassifierOutput with Swin->DonutSwin +class DonutSwinImageClassifierOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +# Copied from transformers.models.swin.modeling_swin.window_partition +def window_partition(input_feature, window_size): + """ + Partitions the given input into windows. + """ + batch_size, height, width, num_channels = input_feature.shape + input_feature = input_feature.view( + batch_size, height // window_size, window_size, width // window_size, window_size, num_channels + ) + windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows + + +# Copied from transformers.models.swin.modeling_swin.window_reverse +def window_reverse(windows, window_size, height, width): + """ + Merges windows to produce higher resolution features. + """ + num_channels = windows.shape[-1] + windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels) + windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels) + return windows + + +# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->DonutSwin +class DonutSwinEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. Optionally, also the mask token. + """ + + def __init__(self, config, use_mask_token=False): + super().__init__() + + self.patch_embeddings = DonutSwinPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.patch_grid = self.patch_embeddings.grid_size + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None + + if config.use_absolute_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim)) + else: + self.position_embeddings = None + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor], + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> tuple[torch.Tensor]: + _, num_channels, height, width = pixel_values.shape + embeddings, output_dimensions = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + batch_size, seq_len, _ = embeddings.size() + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + if self.position_embeddings is not None: + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings, output_dimensions + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->DonutSwin +class DonutSwinPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
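+
+    For example (illustrative): with the defaults `image_size=224`, `patch_size=4` and `embed_dim=96`, a
+    `(batch_size, 3, 224, 224)` input yields embeddings of shape `(batch_size, 56 * 56, 96)`.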
+ """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def maybe_pad(self, pixel_values, height, width): + if width % self.patch_size[1] != 0: + pad_values = (0, self.patch_size[1] - width % self.patch_size[1]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + return pixel_values + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor, tuple[int]]: + _, num_channels, height, width = pixel_values.shape + # pad the input to be divisible by self.patch_size, if needed + pixel_values = self.maybe_pad(pixel_values, height, width) + embeddings = self.projection(pixel_values) + _, _, height, width = embeddings.shape + output_dimensions = (height, width) + embeddings = embeddings.flatten(2).transpose(1, 2) + + return embeddings, output_dimensions + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging +class DonutSwinPatchMerging(nn.Module): + """ + Patch Merging Layer. + + Args: + input_resolution (`tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. 
+ """ + + def __init__(self, input_resolution: tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def maybe_pad(self, input_feature, height, width): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, 0, 0, width % 2, 0, height % 2) + input_feature = nn.functional.pad(input_feature, pad_values) + + return input_feature + + def forward(self, input_feature: torch.Tensor, input_dimensions: tuple[int, int]) -> torch.Tensor: + height, width = input_dimensions + # `dim` is height * width + batch_size, dim, num_channels = input_feature.shape + + input_feature = input_feature.view(batch_size, height, width, num_channels) + # pad input to be disible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + # [batch_size, height/2, width/2, num_channels] + input_feature_0 = input_feature[:, 0::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + # batch_size height/2 width/2 4*num_channels + input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) + + return input_feature + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.swin.modeling_swin.SwinDropPath +class DonutSwinDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->DonutSwin +class DonutSwinSelfAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.window_size = ( + window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) + ) + + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) + ) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij")) + coords_flatten = torch.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + batch_size, dim, num_channels = hidden_states.shape + hidden_shape = (batch_size, dim, -1, self.attention_head_size) + + query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2) + key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] + relative_position_bias = relative_position_bias.view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) + + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in DonutSwinModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.view( + batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) + attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput +class DonutSwinSelfOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->DonutSwin +class DonutSwinAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + self.self = DonutSwinSelfAttention(config, dim, num_heads, window_size) + self.output = DonutSwinSelfOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + 
head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinIntermediate +class DonutSwinIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinOutput +class DonutSwinOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin +class DonutSwinLayer(nn.Module): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.shift_size = shift_size + self.window_size = config.window_size + self.input_resolution = input_resolution + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size) + self.drop_path = DonutSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = DonutSwinIntermediate(config, dim) + self.output = DonutSwinOutput(config, dim) + + def set_shift_and_window_size(self, input_resolution): + if min(input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = torch_int(0) + self.window_size = ( + torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution) + ) + + def get_attn_mask(self, height, width, dtype, device): + if self.shift_size > 0: + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device) + height_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0) + else: + attn_mask = None + return 
attn_mask + + def maybe_pad(self, hidden_states, height, width): + pad_right = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + pad_values = (0, 0, 0, pad_right, 0, pad_bottom) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + if not always_partition: + self.set_shift_and_window_size(input_dimensions) + else: + pass + height, width = input_dimensions + batch_size, _, channels = hidden_states.size() + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + + hidden_states = hidden_states.view(batch_size, height, width, channels) + + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + # cyclic shift + if self.shift_size > 0: + shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_hidden_states = hidden_states + + # partition windows + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) + attn_mask = self.get_attn_mask( + height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device + ) + + attention_outputs = self.attention( + hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions + ) + + attention_output = attention_outputs[0] + + attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) + + # reverse cyclic shift + if self.shift_size > 0: + attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + attention_windows = shifted_windows + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_windows = attention_windows[:, :height, :width, :].contiguous() + + attention_windows = attention_windows.view(batch_size, height * width, channels) + + hidden_states = shortcut + self.drop_path(attention_windows) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->DonutSwin +class DonutSwinStage(GradientCheckpointingLayer): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): + super().__init__() + self.config = config + self.dim = dim + self.blocks = nn.ModuleList( + [ + DonutSwinLayer( + config=config, + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + drop_path_rate=drop_path[i], + shift_size=0 if (i % 2 == 0) else config.window_size // 2, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, 
norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + height, width = input_dimensions + for i, layer_module in enumerate(self.blocks): + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = hidden_states + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinEncoder with Swin->DonutSwin +class DonutSwinEncoder(nn.Module): + def __init__(self, config, grid_size): + super().__init__() + self.num_layers = len(config.depths) + self.config = config + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")] + self.layers = nn.ModuleList( + [ + DonutSwinStage( + config=config, + dim=int(config.embed_dim * 2**i_layer), + input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), + depth=config.depths[i_layer], + num_heads=config.num_heads[i_layer], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=DonutSwinPatchMerging if (i_layer < self.num_layers - 1) else None, + ) + for i_layer in range(self.num_layers) + ] + ) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + output_hidden_states_before_downsampling: Optional[bool] = False, + always_partition: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple, DonutSwinEncoderOutput]: + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + hidden_states = layer_outputs[0] + hidden_states_before_downsampling = layer_outputs[1] + output_dimensions = layer_outputs[2] + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + + if 
output_hidden_states and output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states_before_downsampling.shape + # rearrange b (h w) c -> b c h w + # here we use the original (not downsampled) height and width + reshaped_hidden_state = hidden_states_before_downsampling.view( + batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size + ) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states_before_downsampling,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + elif output_hidden_states and not output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states.shape + # rearrange b (h w) c -> b c h w + reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size) + reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[3:] + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return DonutSwinEncoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + reshaped_hidden_states=all_reshaped_hidden_states, + ) + + +@auto_docstring +# Copied from transformers.models.swin.modeling_swin.SwinPreTrainedModel with Swin->DonutSwin,swin->donut +class DonutSwinPreTrainedModel(PreTrainedModel): + config_class = DonutSwinConfig + base_model_prefix = "donut" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DonutSwinStage"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, DonutSwinEmbeddings): + if module.mask_token is not None: + module.mask_token.data.zero_() + if module.position_embeddings is not None: + module.position_embeddings.data.zero_() + elif isinstance(module, DonutSwinSelfAttention): + module.relative_position_bias_table.data.zero_() + + +@auto_docstring +class DonutSwinModel(DonutSwinPreTrainedModel): + def __init__(self, config, add_pooling_layer=True, use_mask_token=False): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + use_mask_token (`bool`, *optional*, defaults to `False`): + Whether to use a mask token for masked image modeling. + """ + super().__init__(config) + self.config = config + self.num_layers = len(config.depths) + self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1)) + + self.embeddings = DonutSwinEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid) + + self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, DonutSwinModelOutput]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, len(self.config.depths)) + + embedding_output, input_dimensions = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + input_dimensions, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = None + if self.pooler is not None: + pooled_output = self.pooler(sequence_output.transpose(1, 2)) + pooled_output = torch.flatten(pooled_output, 1) + + if not return_dict: + output = (sequence_output, pooled_output) + encoder_outputs[1:] + + return output + + return DonutSwinModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + reshaped_hidden_states=encoder_outputs.reshaped_hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + DonutSwin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + + + + Note that it's possible to fine-tune DonutSwin on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. 
+ + + """ +) +# Copied from transformers.models.swin.modeling_swin.SwinForImageClassification with Swin->DonutSwin,swin->donut +class DonutSwinForImageClassification(DonutSwinPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.donut = DonutSwinModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(self.donut.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, DonutSwinImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.donut( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=logits, config=self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DonutSwinImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + reshaped_hidden_states=outputs.reshaped_hidden_states, + ) + + +__all__ = ["DonutSwinModel", "DonutSwinPreTrainedModel", "DonutSwinForImageClassification"] diff --git a/transformers/src/transformers/models/donut/processing_donut.py b/transformers/src/transformers/models/donut/processing_donut.py new file mode 100644 index 0000000000000000000000000000000000000000..edadc7b12678ba118ad94b91382175ec54838b9a --- /dev/null +++ b/transformers/src/transformers/models/donut/processing_donut.py @@ -0,0 +1,224 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Donut. 
+""" + +import re +import warnings +from contextlib import contextmanager +from typing import Optional, Union + +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging + + +class DonutProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} + + +logger = logging.get_logger(__name__) + + +class DonutProcessor(ProcessorMixin): + r""" + Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single + processor. + + [`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and + [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and + [`~DonutProcessor.decode`] for more information. + + Args: + image_processor ([`DonutImageProcessor`], *optional*): + An instance of [`DonutImageProcessor`]. The image processor is a required input. + tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*): + An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + feature_extractor = None + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + if image_processor is None: + raise ValueError("You need to specify an `image_processor`.") + if tokenizer is None: + raise ValueError("You need to specify a `tokenizer`.") + + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + self._in_target_context_manager = False + + def __call__( + self, + images: ImageInput = None, + text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[DonutProcessorKwargs], + ): + """ + When used in normal mode, this method forwards all its arguments to AutoImageProcessor's + [`~AutoImageProcessor.__call__`] and returns its output. If used in the context + [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's + [`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. 
+ """ + if self._in_target_context_manager: + return self.current_processor(images, text, **kwargs) + + if images is None and text is None: + raise ValueError("You need to specify either an `images` or `text` input to process.") + + output_kwargs = self._merge_kwargs( + DonutProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if images is not None: + inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + if text is not None: + if images is not None: + output_kwargs["text_kwargs"].setdefault("add_special_tokens", False) + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) + + if text is None: + return inputs + elif images is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] # for BC + inputs["input_ids"] = encodings["input_ids"] + return inputs + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to DonutTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the + docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @contextmanager + def as_target_processor(self): + """ + Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR. + """ + warnings.warn( + "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your " + "labels by using the argument `text` of the regular `__call__` method (either in the same call as " + "your images inputs, or in a separate call." + ) + self._in_target_context_manager = True + self.current_processor = self.tokenizer + yield + self.current_processor = self.image_processor + self._in_target_context_manager = False + + def token2json(self, tokens, is_inner_value=False, added_vocab=None): + """ + Convert a (generated) token sequence into an ordered JSON format. 
+ """ + if added_vocab is None: + added_vocab = self.tokenizer.get_added_vocab() + + output = {} + + while tokens: + # We want r"" but without ReDOS risk, so do it manually in two parts + potential_start = re.search(r"" not in start_token: + break + start_token = start_token[: start_token.index(">") + 1] + key = start_token[len("")] + key_escaped = re.escape(key) + + end_token = re.search(rf"", tokens, re.IGNORECASE) + if end_token is None: + tokens = tokens.replace(start_token, "") + else: + end_token = end_token.group() + start_token_escaped = re.escape(start_token) + end_token_escaped = re.escape(end_token) + content = re.search( + f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE | re.DOTALL + ) + if content is not None: + content = content.group(1).strip() + if r""): + leaf = leaf.strip() + if leaf in added_vocab and leaf[0] == "<" and leaf[-2:] == "/>": + leaf = leaf[1:-2] # for categorical special tokens + output[key].append(leaf) + if len(output[key]) == 1: + output[key] = output[key][0] + + tokens = tokens[tokens.find(end_token) + len(end_token) :].strip() + if tokens[:6] == r"": # non-leaf nodes + return [output] + self.token2json(tokens[6:], is_inner_value=True, added_vocab=added_vocab) + + if len(output): + return [output] if is_inner_value else output + else: + return [] if is_inner_value else {"text_sequence": tokens} + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor + + +__all__ = ["DonutProcessor"] diff --git a/transformers/src/transformers/models/dpr/__init__.py b/transformers/src/transformers/models/dpr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9aeadbeaf416575570c280a3e15a52422a007103 --- /dev/null +++ b/transformers/src/transformers/models/dpr/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dpr import * + from .modeling_dpr import * + from .modeling_tf_dpr import * + from .tokenization_dpr import * + from .tokenization_dpr_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/transformers/src/transformers/models/dpr/configuration_dpr.py b/transformers/src/transformers/models/dpr/configuration_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..03b16900249329ad867ae6b13b58b89d7722a25a --- /dev/null +++ b/transformers/src/transformers/models/dpr/configuration_dpr.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright 2010, DPR authors, The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DPR model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DPRConfig(PretrainedConfig): + r""" + [`DPRConfig`] is the configuration class to store the configuration of a *DPRModel*. + + This is the configuration class to store the configuration of a [`DPRContextEncoder`], [`DPRQuestionEncoder`], or a + [`DPRReader`]. It is used to instantiate the components of the DPR model according to the specified arguments, + defining the model component architectures. Instantiating a configuration with the defaults will yield a similar + configuration to that of the DPRContextEncoder + [facebook/dpr-ctx_encoder-single-nq-base](https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base) + architecture. + + This class is a subclass of [`BertConfig`]. Please check the superclass for the documentation of all kwargs. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the DPR model. Defines the different tokens that can be represented by the *inputs_ids* + passed to the forward method of [`BertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the *token_type_ids* passed into [`BertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + projection_dim (`int`, *optional*, defaults to 0): + Dimension of the projection for the context and question encoders. If it is set to zero (default), then no + projection is done. + + Example: + + ```python + >>> from transformers import DPRConfig, DPRContextEncoder + + >>> # Initializing a DPR facebook/dpr-ctx_encoder-single-nq-base style configuration + >>> configuration = DPRConfig() + + >>> # Initializing a model (with random weights) from the facebook/dpr-ctx_encoder-single-nq-base style configuration + >>> model = DPRContextEncoder(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dpr" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + projection_dim: int = 0, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.projection_dim = projection_dim + self.position_embedding_type = position_embedding_type + + +__all__ = ["DPRConfig"] diff --git a/transformers/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py b/transformers/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..5151c0972a7ed72c47d125400b918aba3a0d3c0d --- /dev/null +++ 
b/transformers/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py @@ -0,0 +1,145 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import collections +from pathlib import Path + +import torch +from torch.serialization import default_restore_location + +from transformers import BertConfig, DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader + + +CheckpointState = collections.namedtuple( + "CheckpointState", ["model_dict", "optimizer_dict", "scheduler_dict", "offset", "epoch", "encoder_params"] +) + + +def load_states_from_checkpoint(model_file: str) -> CheckpointState: + print(f"Reading saved model from {model_file}") + state_dict = torch.load( + model_file, map_location=lambda s, l: default_restore_location(s, "cpu"), weights_only=True + ) + return CheckpointState(**state_dict) + + +class DPRState: + def __init__(self, src_file: Path): + self.src_file = src_file + + def load_dpr_model(self): + raise NotImplementedError + + @staticmethod + def from_type(comp_type: str, *args, **kwargs) -> "DPRState": + if comp_type.startswith("c"): + return DPRContextEncoderState(*args, **kwargs) + if comp_type.startswith("q"): + return DPRQuestionEncoderState(*args, **kwargs) + if comp_type.startswith("r"): + return DPRReaderState(*args, **kwargs) + else: + raise ValueError("Component type must be either 'ctx_encoder', 'question_encoder' or 'reader'.") + + +class DPRContextEncoderState(DPRState): + def load_dpr_model(self): + model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) + print(f"Loading DPR biencoder from {self.src_file}") + saved_state = load_states_from_checkpoint(self.src_file) + encoder, prefix = model.ctx_encoder, "ctx_model." + # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 + state_dict = {"bert_model.embeddings.position_ids": model.ctx_encoder.bert_model.embeddings.position_ids} + for key, value in saved_state.model_dict.items(): + if key.startswith(prefix): + key = key[len(prefix) :] + if not key.startswith("encode_proj."): + key = "bert_model." + key + state_dict[key] = value + encoder.load_state_dict(state_dict) + return model + + +class DPRQuestionEncoderState(DPRState): + def load_dpr_model(self): + model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) + print(f"Loading DPR biencoder from {self.src_file}") + saved_state = load_states_from_checkpoint(self.src_file) + encoder, prefix = model.question_encoder, "question_model." + # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 + state_dict = {"bert_model.embeddings.position_ids": model.question_encoder.bert_model.embeddings.position_ids} + for key, value in saved_state.model_dict.items(): + if key.startswith(prefix): + key = key[len(prefix) :] + if not key.startswith("encode_proj."): + key = "bert_model." 
+ key + state_dict[key] = value + encoder.load_state_dict(state_dict) + return model + + +class DPRReaderState(DPRState): + def load_dpr_model(self): + model = DPRReader(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) + print(f"Loading DPR reader from {self.src_file}") + saved_state = load_states_from_checkpoint(self.src_file) + # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 + state_dict = { + "encoder.bert_model.embeddings.position_ids": model.span_predictor.encoder.bert_model.embeddings.position_ids + } + for key, value in saved_state.model_dict.items(): + if key.startswith("encoder.") and not key.startswith("encoder.encode_proj"): + key = "encoder.bert_model." + key[len("encoder.") :] + state_dict[key] = value + model.span_predictor.load_state_dict(state_dict) + return model + + +def convert(comp_type: str, src_file: Path, dest_dir: Path): + dest_dir = Path(dest_dir) + dest_dir.mkdir(exist_ok=True) + + dpr_state = DPRState.from_type(comp_type, src_file=src_file) + model = dpr_state.load_dpr_model() + model.save_pretrained(dest_dir) + model.from_pretrained(dest_dir) # sanity check + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--type", type=str, help="Type of the component to convert: 'ctx_encoder', 'question_encoder' or 'reader'." + ) + parser.add_argument( + "--src", + type=str, + help=( + "Path to the dpr checkpoint file. They can be downloaded from the official DPR repo" + " https://github.com/facebookresearch/DPR. Note that in the official repo, both encoders are stored in the" + " 'retriever' checkpoints." + ), + ) + parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model directory.") + args = parser.parse_args() + + src_file = Path(args.src) + dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest + dest_dir = Path(dest_dir) + assert src_file.exists() + assert args.type is not None, ( + "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'." + ) + convert(args.type, src_file, dest_dir) diff --git a/transformers/src/transformers/models/dpr/modeling_dpr.py b/transformers/src/transformers/models/dpr/modeling_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..3e18b3e732f3edd56212374843724c953784e857 --- /dev/null +++ b/transformers/src/transformers/models/dpr/modeling_dpr.py @@ -0,0 +1,592 @@ +# coding=utf-8 +# Copyright 2018 DPR Authors, The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
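As a usage note for the checkpoint conversion script above, here is a minimal sketch of how it might be driven. The file names are illustrative; the source checkpoint itself comes from the official DPR repository, as the `--src` help text states.

# Command-line form, using the flags defined in the argparse block above:
#   python convert_dpr_original_checkpoint_to_pytorch.py --type ctx_encoder \
#       --src dpr_biencoder.cp --dest ./dpr-ctx-converted
#
# Programmatic form, assuming convert() from the script above is in scope:
from pathlib import Path

convert(comp_type="ctx_encoder", src_file=Path("dpr_biencoder.cp"), dest_dir=Path("./dpr-ctx-converted"))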
+"""PyTorch DPR model for Open Domain Question Answering.""" + +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import Tensor, nn + +from ...modeling_outputs import BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + auto_docstring, + logging, +) +from ..bert.modeling_bert import BertModel +from .configuration_dpr import DPRConfig + + +logger = logging.get_logger(__name__) + + +########## +# Outputs +########## + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`DPRQuestionEncoder`]. + """ +) +class DPRContextEncoderOutput(ModelOutput): + r""" + pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`): + The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer + hidden-state of the first token of the sequence (classification token) further processed by a Linear layer. + This output is to be used to embed contexts for nearest neighbors queries with questions embeddings. + """ + + pooler_output: torch.FloatTensor + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`DPRQuestionEncoder`]. + """ +) +class DPRQuestionEncoderOutput(ModelOutput): + r""" + pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`): + The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer + hidden-state of the first token of the sequence (classification token) further processed by a Linear layer. + This output is to be used to embed questions for nearest neighbors queries with context embeddings. + """ + + pooler_output: torch.FloatTensor + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`DPRQuestionEncoder`]. + """ +) +class DPRReaderOutput(ModelOutput): + r""" + start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`): + Logits of the start index of the span for each passage. + end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`): + Logits of the end index of the span for each passage. + relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`): + Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the + question, compared to all the other passages. 
+ """ + + start_logits: torch.FloatTensor + end_logits: Optional[torch.FloatTensor] = None + relevance_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@auto_docstring +class DPRPreTrainedModel(PreTrainedModel): + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class DPREncoder(DPRPreTrainedModel): + base_model_prefix = "bert_model" + + def __init__(self, config: DPRConfig): + super().__init__(config) + self.bert_model = BertModel(config, add_pooling_layer=False) + if self.bert_model.config.hidden_size <= 0: + raise ValueError("Encoder hidden_size can't be zero") + self.projection_dim = config.projection_dim + if self.projection_dim > 0: + self.encode_proj = nn.Linear(self.bert_model.config.hidden_size, config.projection_dim) + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Tensor, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = False, + ) -> Union[BaseModelOutputWithPooling, tuple[Tensor, ...]]: + outputs = self.bert_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + pooled_output = sequence_output[:, 0, :] + + if self.projection_dim > 0: + pooled_output = self.encode_proj(pooled_output) + + if not return_dict: + return (sequence_output, pooled_output) + outputs[2:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @property + def embeddings_size(self) -> int: + if self.projection_dim > 0: + return self.encode_proj.out_features + return self.bert_model.config.hidden_size + + +class DPRSpanPredictor(DPRPreTrainedModel): + base_model_prefix = "encoder" + + def __init__(self, config: DPRConfig): + super().__init__(config) + self.encoder = DPREncoder(config) + self.qa_outputs = nn.Linear(self.encoder.embeddings_size, 2) + self.qa_classifier = nn.Linear(self.encoder.embeddings_size, 1) + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Tensor, + attention_mask: Tensor, + inputs_embeds: Optional[Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = False, + ) -> Union[DPRReaderOutput, tuple[Tensor, ...]]: + # notations: N - number of questions in a batch, M - number of passages per questions, L - sequence length + n_passages, 
sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2] + # feed encoder + outputs = self.encoder( + input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + + # compute logits + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + relevance_logits = self.qa_classifier(sequence_output[:, 0, :]) + + # resize + start_logits = start_logits.view(n_passages, sequence_length) + end_logits = end_logits.view(n_passages, sequence_length) + relevance_logits = relevance_logits.view(n_passages) + + if not return_dict: + return (start_logits, end_logits, relevance_logits) + outputs[2:] + + return DPRReaderOutput( + start_logits=start_logits, + end_logits=end_logits, + relevance_logits=relevance_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +################## +# PreTrainedModel +################## + + +class DPRPretrainedContextEncoder(DPRPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DPRConfig + load_tf_weights = None + base_model_prefix = "ctx_encoder" + + +class DPRPretrainedQuestionEncoder(DPRPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DPRConfig + load_tf_weights = None + base_model_prefix = "question_encoder" + + +class DPRPretrainedReader(DPRPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DPRConfig + load_tf_weights = None + base_model_prefix = "span_predictor" + + +############### +# Actual Models +############### + + +@auto_docstring( + custom_intro=""" + The bare DPRContextEncoder transformer outputting pooler outputs as context representations. + """ +) +class DPRContextEncoder(DPRPretrainedContextEncoder): + def __init__(self, config: DPRConfig): + super().__init__(config) + self.config = config + self.ctx_encoder = DPREncoder(config) + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[DPRContextEncoderOutput, tuple[Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be + formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs (for a pair title+text for example): + + ``` + tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + ``` + + (b) For single sequences (for a question for example): + + ``` + tokens: [CLS] the dog is hairy . 
[SEP] + token_type_ids: 0 0 0 0 0 0 0 + ``` + + DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right + rather than the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + Examples: + + ```python + >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer + + >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base") + >>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base") + >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"] + >>> embeddings = model(input_ids).pooler_output + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = ( + torch.ones(input_shape, device=device) + if input_ids is None + else (input_ids != self.config.pad_token_id) + ) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + outputs = self.ctx_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return outputs[1:] + return DPRContextEncoderOutput( + pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@auto_docstring( + custom_intro=""" + The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations. + """ +) +class DPRQuestionEncoder(DPRPretrainedQuestionEncoder): + def __init__(self, config: DPRConfig): + super().__init__(config) + self.config = config + self.question_encoder = DPREncoder(config) + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[DPRQuestionEncoderOutput, tuple[Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be + formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs (for a pair title+text for example): + + ``` + tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . 
[SEP] + token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + ``` + + (b) For single sequences (for a question for example): + + ``` + tokens: [CLS] the dog is hairy . [SEP] + token_type_ids: 0 0 0 0 0 0 0 + ``` + + DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right + rather than the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + Examples: + + ```python + >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer + + >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + >>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"] + >>> embeddings = model(input_ids).pooler_output + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = ( + torch.ones(input_shape, device=device) + if input_ids is None + else (input_ids != self.config.pad_token_id) + ) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + outputs = self.question_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return outputs[1:] + return DPRQuestionEncoderOutput( + pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@auto_docstring( + custom_intro=""" + The bare DPRReader transformer outputting span predictions. + """ +) +class DPRReader(DPRPretrainedReader): + def __init__(self, config: DPRConfig): + super().__init__(config) + self.config = config + self.span_predictor = DPRSpanPredictor(config) + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[DPRReaderOutput, tuple[Tensor, ...]]: + r""" + input_ids (`tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
It has to be a sequence triplet with 1) the question + and 2) the passages titles and 3) the passages texts. To match pretraining, DPR `input_ids` sequence should + be formatted with [CLS] and [SEP] with the format: + + `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>` + + DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right + rather than the left. + + Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + Examples: + + ```python + >>> from transformers import DPRReader, DPRReaderTokenizer + + >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base") + >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base") + >>> encoded_inputs = tokenizer( + ... questions=["What is love ?"], + ... titles=["Haddaway"], + ... texts=["'What Is Love' is a song recorded by the artist Haddaway"], + ... return_tensors="pt", + ... ) + >>> outputs = model(**encoded_inputs) + >>> start_logits = outputs.start_logits + >>> end_logits = outputs.end_logits + >>> relevance_logits = outputs.relevance_logits + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + + return self.span_predictor( + input_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +__all__ = [ + "DPRContextEncoder", + "DPRPretrainedContextEncoder", + "DPRPreTrainedModel", + "DPRPretrainedQuestionEncoder", + "DPRPretrainedReader", + "DPRQuestionEncoder", + "DPRReader", +] diff --git a/transformers/src/transformers/models/dpr/modeling_tf_dpr.py b/transformers/src/transformers/models/dpr/modeling_tf_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..1cb4e0ebad071b4fce55139ffcc9614cbd05ab67 --- /dev/null +++ b/transformers/src/transformers/models/dpr/modeling_tf_dpr.py @@ -0,0 +1,800 @@ +# coding=utf-8 +# Copyright 2018 DPR Authors, The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TensorFlow DPR model for Open Domain Question Answering.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional, Union + +import tensorflow as tf + +from ...modeling_tf_outputs import TFBaseModelOutputWithPooling +from ...modeling_tf_utils import TFModelInputType, TFPreTrainedModel, get_initializer, keras, shape_list, unpack_inputs +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ..bert.modeling_tf_bert import TFBertMainLayer +from .configuration_dpr import DPRConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DPRConfig" + + +########## +# Outputs +########## + + +@dataclass +class TFDPRContextEncoderOutput(ModelOutput): + r""" + Class for outputs of [`TFDPRContextEncoder`]. + + Args: + pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`): + The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer + hidden-state of the first token of the sequence (classification token) further processed by a Linear layer. + This output is to be used to embed contexts for nearest neighbors queries with question embeddings. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attention weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + pooler_output: Optional[tf.Tensor] = None + hidden_states: tuple[tf.Tensor, ...] | None = None + attentions: tuple[tf.Tensor, ...] | None = None + + +@dataclass +class TFDPRQuestionEncoderOutput(ModelOutput): + """ + Class for outputs of [`TFDPRQuestionEncoder`]. + + Args: + pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`): + The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer + hidden-state of the first token of the sequence (classification token) further processed by a Linear layer. + This output is to be used to embed questions for nearest neighbors queries with context embeddings. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attention weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + pooler_output: Optional[tf.Tensor] = None + hidden_states: tuple[tf.Tensor, ...] | None = None + attentions: tuple[tf.Tensor, ...] | None = None + + +@dataclass +class TFDPRReaderOutput(ModelOutput): + """ + Class for outputs of [`TFDPRReader`]. + + Args: + start_logits (`tf.Tensor` of shape `(n_passages, sequence_length)`): + Logits of the start index of the span for each passage. + end_logits (`tf.Tensor` of shape `(n_passages, sequence_length)`): + Logits of the end index of the span for each passage. + relevance_logits (`tf.Tensor` of shape `(n_passages, )`): + Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the + question, compared to all the other passages. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attention weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + start_logits: Optional[tf.Tensor] = None + end_logits: Optional[tf.Tensor] = None + relevance_logits: Optional[tf.Tensor] = None + hidden_states: tuple[tf.Tensor, ...] | None = None + attentions: tuple[tf.Tensor, ...]
| None = None + + +class TFDPREncoderLayer(keras.layers.Layer): + base_model_prefix = "bert_model" + + def __init__(self, config: DPRConfig, **kwargs): + super().__init__(**kwargs) + + # resolve name conflict with TFBertMainLayer instead of TFBertModel + self.bert_model = TFBertMainLayer(config, add_pooling_layer=False, name="bert_model") + self.config = config + + if self.config.hidden_size <= 0: + raise ValueError("Encoder hidden_size can't be zero") + self.projection_dim = config.projection_dim + if self.projection_dim > 0: + self.encode_proj = keras.layers.Dense( + config.projection_dim, kernel_initializer=get_initializer(config.initializer_range), name="encode_proj" + ) + + @unpack_inputs + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPooling, tuple[tf.Tensor, ...]]: + outputs = self.bert_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + pooled_output = sequence_output[:, 0, :] + if self.projection_dim > 0: + pooled_output = self.encode_proj(pooled_output) + + if not return_dict: + return (sequence_output, pooled_output) + outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @property + def embeddings_size(self) -> int: + if self.projection_dim > 0: + return self.projection_dim + return self.bert_model.config.hidden_size + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert_model", None) is not None: + with tf.name_scope(self.bert_model.name): + self.bert_model.build(None) + if getattr(self, "encode_proj", None) is not None: + with tf.name_scope(self.encode_proj.name): + self.encode_proj.build(None) + + +class TFDPRSpanPredictorLayer(keras.layers.Layer): + base_model_prefix = "encoder" + + def __init__(self, config: DPRConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.encoder = TFDPREncoderLayer(config, name="encoder") + + self.qa_outputs = keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.qa_classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="qa_classifier" + ) + + @unpack_inputs + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = False, + training: bool = False, + ) -> Union[TFDPRReaderOutput, tuple[tf.Tensor, ...]]: + # notations: N - number of questions in a batch, M - number of passages per question, L - sequence length + n_passages, sequence_length = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)[:2] + # feed encoder + outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, +
output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + # compute logits + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + relevance_logits = self.qa_classifier(sequence_output[:, 0, :]) + + # resize + start_logits = tf.reshape(start_logits, [n_passages, sequence_length]) + end_logits = tf.reshape(end_logits, [n_passages, sequence_length]) + relevance_logits = tf.reshape(relevance_logits, [n_passages]) + + if not return_dict: + return (start_logits, end_logits, relevance_logits) + outputs[2:] + + return TFDPRReaderOutput( + start_logits=start_logits, + end_logits=end_logits, + relevance_logits=relevance_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.encoder.embeddings_size]) + if getattr(self, "qa_classifier", None) is not None: + with tf.name_scope(self.qa_classifier.name): + self.qa_classifier.build([None, None, self.encoder.embeddings_size]) + + +class TFDPRSpanPredictor(TFPreTrainedModel): + base_model_prefix = "encoder" + + def __init__(self, config: DPRConfig, **kwargs): + super().__init__(config, **kwargs) + self.encoder = TFDPRSpanPredictorLayer(config) + + @unpack_inputs + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = False, + training: bool = False, + ) -> Union[TFDPRReaderOutput, tuple[tf.Tensor, ...]]: + outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + +class TFDPREncoder(TFPreTrainedModel): + base_model_prefix = "encoder" + + def __init__(self, config: DPRConfig, **kwargs): + super().__init__(config, **kwargs) + + self.encoder = TFDPREncoderLayer(config) + + @unpack_inputs + def call( + self, + input_ids: Optional[tf.Tensor] = None, + attention_mask: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = False, + training: bool = False, + ) -> Union[TFDPRReaderOutput, tuple[tf.Tensor, ...]]: + outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + +################## +# PreTrainedModel +################## + + +class TFDPRPretrainedContextEncoder(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = DPRConfig + base_model_prefix = "ctx_encoder" + + +class TFDPRPretrainedQuestionEncoder(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DPRConfig + base_model_prefix = "question_encoder" + + +class TFDPRPretrainedReader(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DPRConfig + base_model_prefix = "reader" + + +############### +# Actual Models +############### + + +TF_DPR_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Tensorflow [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) + subclass. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to + general usage and behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`DPRConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +TF_DPR_ENCODERS_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be + formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs (for a pair title+text for example): + + ``` + tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . 
[SEP] + token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + ``` + + (b) For single sequences (for a question for example): + + ``` + tokens: [CLS] the dog is hairy . [SEP] + token_type_ids: 0 0 0 0 0 0 0 + ``` + + DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right + rather than the left. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + inputs_embeds (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + +TF_DPR_READER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shapes `(n_passages, sequence_length)`): + Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question + and 2) the passages titles and 3) the passages texts. To match pretraining, DPR `input_ids` sequence should + be formatted with [CLS] and [SEP] with the format: + + `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>` + + DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right + rather than the left. + + Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details. + attention_mask (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**.
+ + [What are attention masks?](../glossary#attention-mask) + inputs_embeds (`Numpy array` or `tf.Tensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare DPRContextEncoder transformer outputting pooler outputs as context representations.", + TF_DPR_START_DOCSTRING, +) +class TFDPRContextEncoder(TFDPRPretrainedContextEncoder): + def __init__(self, config: DPRConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.ctx_encoder = TFDPREncoderLayer(config, name="ctx_encoder") + + def get_input_embeddings(self): + try: + return self.ctx_encoder.bert_model.get_input_embeddings() + except AttributeError: + self.build() + return self.ctx_encoder.bert_model.get_input_embeddings() + + @unpack_inputs + @add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFDPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFDPRContextEncoderOutput | tuple[tf.Tensor, ...]: + r""" + Return: + + Examples: + + ```python + >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer + + >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base") + >>> model = TFDPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", from_pt=True) + >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"] + >>> embeddings = model(input_ids).pooler_output + ``` + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = ( + tf.ones(input_shape, dtype=tf.dtypes.int32) + if input_ids is None + else (input_ids != self.config.pad_token_id) + ) + if token_type_ids is None: + token_type_ids = tf.zeros(input_shape, dtype=tf.dtypes.int32) + + outputs = self.ctx_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + 
token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return outputs[1:] + + return TFDPRContextEncoderOutput( + pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "ctx_encoder", None) is not None: + with tf.name_scope(self.ctx_encoder.name): + self.ctx_encoder.build(None) + + +@add_start_docstrings( + "The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.", + TF_DPR_START_DOCSTRING, +) +class TFDPRQuestionEncoder(TFDPRPretrainedQuestionEncoder): + def __init__(self, config: DPRConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.question_encoder = TFDPREncoderLayer(config, name="question_encoder") + + def get_input_embeddings(self): + try: + return self.question_encoder.bert_model.get_input_embeddings() + except AttributeError: + self.build() + return self.question_encoder.bert_model.get_input_embeddings() + + @unpack_inputs + @add_start_docstrings_to_model_forward(TF_DPR_ENCODERS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFDPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFDPRQuestionEncoderOutput | tuple[tf.Tensor, ...]: + r""" + Return: + + Examples: + + ```python + >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer + + >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base") + >>> model = TFDPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base", from_pt=True) + >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"] + >>> embeddings = model(input_ids).pooler_output + ``` + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = ( + tf.ones(input_shape, dtype=tf.dtypes.int32) + if input_ids is None + else (input_ids != self.config.pad_token_id) + ) + if token_type_ids is None: + token_type_ids = tf.zeros(input_shape, dtype=tf.dtypes.int32) + + outputs = self.question_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return outputs[1:] + return TFDPRQuestionEncoderOutput( + pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, 
"question_encoder", None) is not None: + with tf.name_scope(self.question_encoder.name): + self.question_encoder.build(None) + + +@add_start_docstrings( + "The bare DPRReader transformer outputting span predictions.", + TF_DPR_START_DOCSTRING, +) +class TFDPRReader(TFDPRPretrainedReader): + def __init__(self, config: DPRConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.span_predictor = TFDPRSpanPredictorLayer(config, name="span_predictor") + + def get_input_embeddings(self): + try: + return self.span_predictor.encoder.bert_model.get_input_embeddings() + except AttributeError: + self.build() + return self.span_predictor.encoder.bert_model.get_input_embeddings() + + @unpack_inputs + @add_start_docstrings_to_model_forward(TF_DPR_READER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFDPRReaderOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFDPRReaderOutput | tuple[tf.Tensor, ...]: + r""" + Return: + + Examples: + + ```python + >>> from transformers import TFDPRReader, DPRReaderTokenizer + + >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base") + >>> model = TFDPRReader.from_pretrained("facebook/dpr-reader-single-nq-base", from_pt=True) + >>> encoded_inputs = tokenizer( + ... questions=["What is love ?"], + ... titles=["Haddaway"], + ... texts=["'What Is Love' is a song recorded by the artist Haddaway"], + ... return_tensors="tf", + ... ) + >>> outputs = model(encoded_inputs) + >>> start_logits = outputs.start_logits + >>> end_logits = outputs.end_logits + >>> relevance_logits = outputs.relevance_logits + ``` + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.ones(input_shape, dtype=tf.dtypes.int32) + + return self.span_predictor( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "span_predictor", None) is not None: + with tf.name_scope(self.span_predictor.name): + self.span_predictor.build(None) + + +__all__ = [ + "TFDPRContextEncoder", + "TFDPRPretrainedContextEncoder", + "TFDPRPretrainedQuestionEncoder", + "TFDPRPretrainedReader", + "TFDPRQuestionEncoder", + "TFDPRReader", +] diff --git a/transformers/src/transformers/models/dpr/tokenization_dpr.py b/transformers/src/transformers/models/dpr/tokenization_dpr.py new file mode 100644 index 0000000000000000000000000000000000000000..020b235cb6bd97bda74f2e067294a9391617e00f --- /dev/null +++ b/transformers/src/transformers/models/dpr/tokenization_dpr.py @@ -0,0 +1,321 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team, The Hugging Face Team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DPR.""" + +import collections +from typing import Optional, Union + +from ...tokenization_utils_base import BatchEncoding +from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging +from ..bert.tokenization_bert import BertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +class DPRContextEncoderTokenizer(BertTokenizer): + r""" + Construct a DPRContextEncoder tokenizer. + + [`DPRContextEncoderTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation + splitting and wordpiece. + + Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + +class DPRQuestionEncoderTokenizer(BertTokenizer): + r""" + Construct a DPRQuestionEncoder tokenizer. + + [`DPRQuestionEncoderTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation + splitting and wordpiece. + + Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + +DPRSpanPrediction = collections.namedtuple( + "DPRSpanPrediction", ["span_score", "relevance_score", "doc_id", "start_index", "end_index", "text"] +) + +DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "end_logits", "relevance_logits"]) + + +CUSTOM_DPR_READER_DOCSTRING = r""" + Return a dictionary with the token ids of the input strings and other information to give to `.decode_best_spans`. + It converts the strings of a question and different passages (title and text) into a sequence of IDs (integers), + using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size `(n_passages, sequence_length)` + with the format: + + ``` + [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids> + ``` + + Args: + questions (`str` or `list[str]`): + The questions to be encoded. You can specify one question for many passages. In this case, the question + will be duplicated like `[questions] * n_passages`. Otherwise you have to specify as many questions as in + `titles` or `texts`. + titles (`str` or `list[str]`): + The passages titles to be encoded. This can be a string or a list of strings if there are several passages. + texts (`str` or `list[str]`): + The passages texts to be encoded. This can be a string or a list of strings if there are several passages. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence + is provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided.
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will truncate + token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch + of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. This will only truncate the first + sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. This will only truncate the + second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to `None`, this will use the predefined model maximum length if a maximum length + is required by one of the truncation/padding parameters. If the model has no specific maximum input + length (like XLNet) truncation/padding to a maximum length will be deactivated. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + return_attention_mask (`bool`, *optional*): + Whether or not to return the attention mask. If not set, will return the attention mask according to the + specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + + Returns: + `dict[str, list[list[int]]]`: A dictionary with the following keys: + + - `input_ids`: List of token ids to be fed to a model. + - `attention_mask`: List of indices specifying which tokens should be attended to by the model. 
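+ + Example (an illustrative sketch added here; it mirrors the `decode_best_spans` example below and uses the same checkpoint): + + ```python + >>> from transformers import DPRReaderTokenizer + + >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base") + >>> encoded = tokenizer( + ... questions=["What is love ?"], + ... titles=["Haddaway"], + ... texts=["'What Is Love' is a song recorded by the artist Haddaway"], + ... ) + >>> # one row of token ids per passage, in the format [CLS] question [SEP] title [SEP] text + >>> len(encoded["input_ids"]) + 1 + ```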
+ """ + + +@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING) +class CustomDPRReaderTokenizerMixin: + def __call__( + self, + questions, + titles: Optional[str] = None, + texts: Optional[str] = None, + padding: Union[bool, str] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_attention_mask: Optional[bool] = None, + **kwargs, + ) -> BatchEncoding: + if titles is None and texts is None: + return super().__call__( + questions, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + **kwargs, + ) + elif titles is None or texts is None: + text_pair = titles if texts is None else texts + return super().__call__( + questions, + text_pair, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + **kwargs, + ) + titles = titles if not isinstance(titles, str) else [titles] + texts = texts if not isinstance(texts, str) else [texts] + n_passages = len(titles) + questions = questions if not isinstance(questions, str) else [questions] * n_passages + if len(titles) != len(texts): + raise ValueError( + f"There should be as many titles than texts but got {len(titles)} titles and {len(texts)} texts." + ) + encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"] + encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"] + encoded_inputs = { + "input_ids": [ + (encoded_question_and_title + encoded_text)[:max_length] + if max_length is not None and truncation + else encoded_question_and_title + encoded_text + for encoded_question_and_title, encoded_text in zip(encoded_question_and_titles, encoded_texts) + ] + } + if return_attention_mask is not False: + attention_mask = [] + for input_ids in encoded_inputs["input_ids"]: + attention_mask.append([int(input_id != self.pad_token_id) for input_id in input_ids]) + encoded_inputs["attention_mask"] = attention_mask + return self.pad(encoded_inputs, padding=padding, max_length=max_length, return_tensors=return_tensors) + + def decode_best_spans( + self, + reader_input: BatchEncoding, + reader_output: DPRReaderOutput, + num_spans: int = 16, + max_answer_length: int = 64, + num_spans_per_passage: int = 4, + ) -> list[DPRSpanPrediction]: + """ + Get the span predictions for the extractive Q&A model. + + Returns: *List* of *DPRReaderOutput* sorted by descending *(relevance_score, span_score)*. Each + *DPRReaderOutput* is a *Tuple* with: + + - **span_score**: `float` that corresponds to the score given by the reader for this span compared to other + spans in the same passage. It corresponds to the sum of the start and end logits of the span. + - **relevance_score**: `float` that corresponds to the score of the each passage to answer the question, + compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader. + - **doc_id**: `int` the id of the passage. - **start_index**: `int` the start index of the span + (inclusive). - **end_index**: `int` the end index of the span (inclusive). 
+ + Examples: + + ```python + >>> from transformers import DPRReader, DPRReaderTokenizer + + >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base") + >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base") + >>> encoded_inputs = tokenizer( + ... questions=["What is love ?"], + ... titles=["Haddaway"], + ... texts=["'What Is Love' is a song recorded by the artist Haddaway"], + ... return_tensors="pt", + ... ) + >>> outputs = model(**encoded_inputs) + >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs) + >>> print(predicted_spans[0].text) # best span + a song + ```""" + input_ids = reader_input["input_ids"] + start_logits, end_logits, relevance_logits = reader_output[:3] + n_passages = len(relevance_logits) + sorted_docs = sorted(range(n_passages), reverse=True, key=relevance_logits.__getitem__) + nbest_spans_predictions: list[DPRSpanPrediction] = [] + for doc_id in sorted_docs: + sequence_ids = list(input_ids[doc_id]) + # assuming question & title information is at the beginning of the sequence + passage_offset = sequence_ids.index(self.sep_token_id, 2) + 1 # second sep id + if sequence_ids[-1] == self.pad_token_id: + sequence_len = sequence_ids.index(self.pad_token_id) + else: + sequence_len = len(sequence_ids) + + best_spans = self._get_best_spans( + start_logits=start_logits[doc_id][passage_offset:sequence_len], + end_logits=end_logits[doc_id][passage_offset:sequence_len], + max_answer_length=max_answer_length, + top_spans=num_spans_per_passage, + ) + for start_index, end_index in best_spans: + start_index += passage_offset + end_index += passage_offset + nbest_spans_predictions.append( + DPRSpanPrediction( + span_score=start_logits[doc_id][start_index] + end_logits[doc_id][end_index], + relevance_score=relevance_logits[doc_id], + doc_id=doc_id, + start_index=start_index, + end_index=end_index, + text=self.decode(sequence_ids[start_index : end_index + 1]), + ) + ) + if len(nbest_spans_predictions) >= num_spans: + break + return nbest_spans_predictions[:num_spans] + + def _get_best_spans( + self, + start_logits: list[float], + end_logits: list[float], + max_answer_length: int, + top_spans: int, + ) -> list[tuple[int, int]]: + """ + Finds the best answer span for the extractive Q&A model for one passage. It returns the best span by descending + `span_score` order and keeping max `top_spans` spans. Spans longer than `max_answer_length` are ignored.
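+ + Example (a toy illustration added for clarity; it assumes `tokenizer` is a [`DPRReaderTokenizer`] instance and calls this internal helper directly): + + ```python + >>> # span (0, 0) scores 1.0 + 3.0 = 4.0, the highest of all candidate spans + >>> tokenizer._get_best_spans( + ... start_logits=[1.0, 2.0], end_logits=[3.0, 1.0], max_answer_length=2, top_spans=1 + ... ) + [(0, 0)] + ```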
+ """ + scores = [] + for start_index, start_score in enumerate(start_logits): + for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length]): + scores.append(((start_index, start_index + answer_length), start_score + end_score)) + scores = sorted(scores, key=lambda x: x[1], reverse=True) + chosen_span_intervals = [] + for (start_index, end_index), score in scores: + if start_index > end_index: + raise ValueError(f"Wrong span indices: [{start_index}:{end_index}]") + length = end_index - start_index + 1 + if length > max_answer_length: + raise ValueError(f"Span is too long: {length} > {max_answer_length}") + if any( + start_index <= prev_start_index <= prev_end_index <= end_index + or prev_start_index <= start_index <= end_index <= prev_end_index + for (prev_start_index, prev_end_index) in chosen_span_intervals + ): + continue + chosen_span_intervals.append((start_index, end_index)) + + if len(chosen_span_intervals) == top_spans: + break + return chosen_span_intervals + + +@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING) +class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer): + r""" + Construct a DPRReader tokenizer. + + [`DPRReaderTokenizer`] is almost identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation + splitting and wordpiece. The difference is that is has three inputs strings: question, titles and texts that are + combined to be fed to the [`DPRReader`] model. + + Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + +__all__ = ["DPRContextEncoderTokenizer", "DPRQuestionEncoderTokenizer", "DPRReaderOutput", "DPRReaderTokenizer"] diff --git a/transformers/src/transformers/models/dpr/tokenization_dpr_fast.py b/transformers/src/transformers/models/dpr/tokenization_dpr_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..dbf745291745c3ac29472391822b09ba68d933a4 --- /dev/null +++ b/transformers/src/transformers/models/dpr/tokenization_dpr_fast.py @@ -0,0 +1,321 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team, The Hugging Face Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DPR.""" + +import collections +from typing import Optional, Union + +from ...tokenization_utils_base import BatchEncoding +from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging +from ..bert.tokenization_bert_fast import BertTokenizerFast +from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +class DPRContextEncoderTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's *tokenizers* library). 
+ + [`DPRContextEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: + punctuation splitting and wordpiece. + + Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = DPRContextEncoderTokenizer + + +class DPRQuestionEncoderTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's *tokenizers* library). + + [`DPRQuestionEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: + punctuation splitting and wordpiece. + + Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = DPRQuestionEncoderTokenizer + + +DPRSpanPrediction = collections.namedtuple( + "DPRSpanPrediction", ["span_score", "relevance_score", "doc_id", "start_index", "end_index", "text"] +) + +DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "end_logits", "relevance_logits"]) + + +CUSTOM_DPR_READER_DOCSTRING = r""" + Return a dictionary with the token ids of the input strings and other information to give to `.decode_best_spans`. + It converts the strings of a question and different passages (title and text) into a sequence of IDs (integers), + using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size `(n_passages, sequence_length)` + with the format: + + [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids> + + Args: + questions (`str` or `list[str]`): + The questions to be encoded. You can specify one question for many passages. In this case, the question + will be duplicated like `[questions] * n_passages`. Otherwise you have to specify as many questions as in + `titles` or `texts`. + titles (`str` or `list[str]`): + The passages titles to be encoded. This can be a string or a list of strings if there are several passages. + texts (`str` or `list[str]`): + The passages texts to be encoded. This can be a string or a list of strings if there are several passages. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence + is provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will truncate + token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch + of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. This will only truncate the first + sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+            - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided. This will only truncate the
+              second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+            - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output a batch with sequence lengths
+              greater than the model maximum admissible input size).
+        max_length (`int`, *optional*):
+            Controls the maximum length to use by one of the truncation/padding parameters.
+
+            If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+            is required by one of the truncation/padding parameters. If the model has no specific maximum input
+            length (like XLNet) truncation/padding to a maximum length will be deactivated.
+        return_tensors (`str` or [`~utils.TensorType`], *optional*):
+            If set, will return tensors instead of lists of python integers. Acceptable values are:
+
+            - `'tf'`: Return TensorFlow `tf.constant` objects.
+            - `'pt'`: Return PyTorch `torch.Tensor` objects.
+            - `'np'`: Return Numpy `np.ndarray` objects.
+        return_attention_mask (`bool`, *optional*):
+            Whether or not to return the attention mask. If not set, will return the attention mask according to the
+            specific tokenizer's default, defined by the `return_outputs` attribute.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+    Return:
+        `dict[str, list[list[int]]]`: A dictionary with the following keys:
+
+        - `input_ids`: List of token ids to be fed to a model.
+        - `attention_mask`: List of indices specifying which tokens should be attended to by the model.
+    """
+
+
+@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING)
+class CustomDPRReaderTokenizerMixin:
+    def __call__(
+        self,
+        questions,
+        titles: Optional[str] = None,
+        texts: Optional[str] = None,
+        padding: Union[bool, str] = False,
+        truncation: Union[bool, str] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        if titles is None and texts is None:
+            return super().__call__(
+                questions,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                return_attention_mask=return_attention_mask,
+                **kwargs,
+            )
+        elif titles is None or texts is None:
+            text_pair = titles if texts is None else texts
+            return super().__call__(
+                questions,
+                text_pair,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                return_tensors=return_tensors,
+                return_attention_mask=return_attention_mask,
+                **kwargs,
+            )
+        titles = titles if not isinstance(titles, str) else [titles]
+        texts = texts if not isinstance(texts, str) else [texts]
+        n_passages = len(titles)
+        questions = questions if not isinstance(questions, str) else [questions] * n_passages
+        assert len(titles) == len(texts), (
+            f"There should be as many titles as texts but got {len(titles)} titles and {len(texts)} texts."
+        )
+        encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"]
+        encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"]
+        encoded_inputs = {
+            "input_ids": [
+                (encoded_question_and_title + encoded_text)[:max_length]
+                if max_length is not None and truncation
+                else encoded_question_and_title + encoded_text
+                for encoded_question_and_title, encoded_text in zip(encoded_question_and_titles, encoded_texts)
+            ]
+        }
+        if return_attention_mask is not False:
+            attention_mask = []
+            for input_ids in encoded_inputs["input_ids"]:
+                attention_mask.append([int(input_id != self.pad_token_id) for input_id in input_ids])
+            encoded_inputs["attention_mask"] = attention_mask
+        return self.pad(encoded_inputs, padding=padding, max_length=max_length, return_tensors=return_tensors)
+
+    def decode_best_spans(
+        self,
+        reader_input: BatchEncoding,
+        reader_output: DPRReaderOutput,
+        num_spans: int = 16,
+        max_answer_length: int = 64,
+        num_spans_per_passage: int = 4,
+    ) -> list[DPRSpanPrediction]:
+        """
+        Get the span predictions for the extractive Q&A model.
+
+        Returns: *list* of *DPRSpanPrediction* sorted by descending *(relevance_score, span_score)*. Each
+        *DPRSpanPrediction* is a *tuple* with:
+
+        - **span_score**: `float` that corresponds to the score given by the reader for this span compared to other
+          spans in the same passage. It corresponds to the sum of the start and end logits of the span.
+        - **relevance_score**: `float` that corresponds to the score of each passage to answer the question,
+          compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
+        - **doc_id**: `int` the id of the passage.
+        - **start_index**: `int` the start index of the span (inclusive).
+        - **end_index**: `int` the end index of the span (inclusive).
+
+        Examples:
+
+        ```python
+        >>> from transformers import DPRReader, DPRReaderTokenizer
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> encoded_inputs = tokenizer(
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="pt",
+        ... )
+        >>> outputs = model(**encoded_inputs)
+        >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
+        >>> print(predicted_spans[0].text)  # best span
+        a song
+        ```"""
+        input_ids = reader_input["input_ids"]
+        start_logits, end_logits, relevance_logits = reader_output[:3]
+        n_passages = len(relevance_logits)
+        sorted_docs = sorted(range(n_passages), reverse=True, key=relevance_logits.__getitem__)
+        nbest_spans_predictions: list[DPRSpanPrediction] = []
+        for doc_id in sorted_docs:
+            sequence_ids = list(input_ids[doc_id])
+            # assuming question & title information is at the beginning of the sequence
+            passage_offset = sequence_ids.index(self.sep_token_id, 2) + 1  # second sep id
+            if sequence_ids[-1] == self.pad_token_id:
+                sequence_len = sequence_ids.index(self.pad_token_id)
+            else:
+                sequence_len = len(sequence_ids)
+
+            best_spans = self._get_best_spans(
+                start_logits=start_logits[doc_id][passage_offset:sequence_len],
+                end_logits=end_logits[doc_id][passage_offset:sequence_len],
+                max_answer_length=max_answer_length,
+                top_spans=num_spans_per_passage,
+            )
+            for start_index, end_index in best_spans:
+                start_index += passage_offset
+                end_index += passage_offset
+                nbest_spans_predictions.append(
+                    DPRSpanPrediction(
+                        span_score=start_logits[doc_id][start_index] + end_logits[doc_id][end_index],
+                        relevance_score=relevance_logits[doc_id],
+                        doc_id=doc_id,
+                        start_index=start_index,
+                        end_index=end_index,
+                        text=self.decode(sequence_ids[start_index : end_index + 1]),
+                    )
+                )
+            if len(nbest_spans_predictions) >= num_spans:
+                break
+        return nbest_spans_predictions[:num_spans]
+
+    def _get_best_spans(
+        self,
+        start_logits: list[float],
+        end_logits: list[float],
+        max_answer_length: int,
+        top_spans: int,
+    ) -> list[tuple[int, int]]:
+        """
+        Finds the best answer spans for the extractive Q&A model for one passage. It returns the best spans by
+        descending `span_score` order, keeping at most `top_spans` spans. Spans longer than `max_answer_length` are
+        ignored.
+        """
+        scores = []
+        for start_index, start_score in enumerate(start_logits):
+            for answer_length, end_score in enumerate(end_logits[start_index : start_index + max_answer_length]):
+                scores.append(((start_index, start_index + answer_length), start_score + end_score))
+        scores = sorted(scores, key=lambda x: x[1], reverse=True)
+        chosen_span_intervals = []
+        for (start_index, end_index), score in scores:
+            if start_index > end_index:
+                raise ValueError(f"Wrong span indices: [{start_index}:{end_index}]")
+            length = end_index - start_index + 1
+            if length > max_answer_length:
+                raise ValueError(f"Span is too long: {length} > {max_answer_length}")
+            if any(
+                start_index <= prev_start_index <= prev_end_index <= end_index
+                or prev_start_index <= start_index <= end_index <= prev_end_index
+                for (prev_start_index, prev_end_index) in chosen_span_intervals
+            ):
+                continue
+            chosen_span_intervals.append((start_index, end_index))
+
+            if len(chosen_span_intervals) == top_spans:
+                break
+        return chosen_span_intervals
+
+
+@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
+class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
+    r"""
+    Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's *tokenizers* library).
+
+    [`DPRReaderTokenizerFast`] is almost identical to [`BertTokenizerFast`] and runs end-to-end tokenization:
+    punctuation splitting and wordpiece. The difference is that it has three input strings: question, titles and
+    texts that are combined to be fed to the [`DPRReader`] model.
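+
+    A minimal illustrative call (mirroring the slow-tokenizer example in `decode_best_spans` above; the checkpoint
+    name is assumed to be the same one used there):
+
+    ```python
+    >>> from transformers import DPRReaderTokenizerFast
+
+    >>> tokenizer = DPRReaderTokenizerFast.from_pretrained("facebook/dpr-reader-single-nq-base")
+    >>> encoded_inputs = tokenizer(
+    ...     questions=["What is love ?"],
+    ...     titles=["Haddaway"],
+    ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+    ...     return_tensors="pt",
+    ... )
+    >>> encoded_inputs["input_ids"].shape[0]  # one passage, one row in the (n_passages, seq_len) matrix
+    1
+    ```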
+
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters.
+
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = DPRReaderTokenizer
+
+
+__all__ = ["DPRContextEncoderTokenizerFast", "DPRQuestionEncoderTokenizerFast", "DPRReaderTokenizerFast"]
diff --git a/transformers/src/transformers/models/electra/__init__.py b/transformers/src/transformers/models/electra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a78ed5c42aea51038335efabde5b03e333592ed6
--- /dev/null
+++ b/transformers/src/transformers/models/electra/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_electra import *
+    from .modeling_electra import *
+    from .modeling_flax_electra import *
+    from .modeling_tf_electra import *
+    from .tokenization_electra import *
+    from .tokenization_electra_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/transformers/src/transformers/models/electra/configuration_electra.py b/transformers/src/transformers/models/electra/configuration_electra.py
new file mode 100644
index 0000000000000000000000000000000000000000..f12756d976b35ee3a4f333483b1b4e6e1a07fb7e
--- /dev/null
+++ b/transformers/src/transformers/models/electra/configuration_electra.py
@@ -0,0 +1,187 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ELECTRA model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ElectraConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`ElectraModel`] or a [`TFElectraModel`]. It is
+    used to instantiate an ELECTRA model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the ELECTRA
+    [google/electra-small-discriminator](https://huggingface.co/google/electra-small-discriminator) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the ELECTRA model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`ElectraModel`] or [`TFElectraModel`].
+        embedding_size (`int`, *optional*, defaults to 128):
+            Dimensionality of the embedding layer.
+        hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`ElectraModel`] or [`TFElectraModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        summary_type (`str`, *optional*, defaults to `"first"`):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Has to be one of the following options:
+
+            - `"last"`: Take the last token hidden state (like XLNet).
+            - `"first"`: Take the first token hidden state (like BERT).
+            - `"mean"`: Take the mean of all tokens hidden states.
+            - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+            - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Whether or not to add a projection after the vector extraction.
+        summary_activation (`str`, *optional*):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Pass `"gelu"` for a gelu activation to the output, any other value will result in no activation.
+        summary_last_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            The dropout ratio to be used after the projection and activation.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import ElectraConfig, ElectraModel
+
+    >>> # Initializing an ELECTRA electra-base-uncased style configuration
+    >>> configuration = ElectraConfig()
+
+    >>> # Initializing a model (with random weights) from the electra-base-uncased style configuration
+    >>> model = ElectraModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "electra"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        embedding_size=128,
+        hidden_size=256,
+        num_hidden_layers=12,
+        num_attention_heads=4,
+        intermediate_size=1024,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        summary_type="first",
+        summary_use_proj=True,
+        summary_activation="gelu",
+        summary_last_dropout=0.1,
+        pad_token_id=0,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.embedding_size = embedding_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_last_dropout = summary_last_dropout
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class ElectraOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+                ("token_type_ids", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["ElectraConfig", "ElectraOnnxConfig"]
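+
+# An illustrative sketch (an assumption, not upstream code): exercising the dynamic axes
+# declared in `ElectraOnnxConfig.inputs`. `task` defaults to "default" in the `OnnxConfig`
+# base class, so every input maps to {0: "batch", 1: "sequence"} here.
+#
+#     config = ElectraConfig()
+#     onnx_config = ElectraOnnxConfig(config)
+#     assert onnx_config.inputs["input_ids"] == {0: "batch", 1: "sequence"}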
diff --git a/transformers/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py b/transformers/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..b0abc30cd758743b243baabbf1298bcc2e1e595e --- /dev/null +++ b/transformers/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ELECTRA checkpoint.""" + +import argparse + +import torch + +from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra +from transformers.utils import logging + + +logging.set_verbosity_info() + + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): + # Initialise PyTorch model + config = ElectraConfig.from_json_file(config_file) + print(f"Building PyTorch model from configuration: {config}") + + if discriminator_or_generator == "discriminator": + model = ElectraForPreTraining(config) + elif discriminator_or_generator == "generator": + model = ElectraForMaskedLM(config) + else: + raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") + + # Load weights from tf checkpoint + load_tf_weights_in_electra( + model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator + ) + + # Save pytorch-model + print(f"Save PyTorch model to {pytorch_dump_path}") + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--discriminator_or_generator", + default=None, + type=str, + required=True, + help=( + "Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " + "'generator'." 
+        ),
+    )
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(
+        args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator
+    )
diff --git a/transformers/src/transformers/models/electra/modeling_electra.py b/transformers/src/transformers/models/electra/modeling_electra.py
new file mode 100644
index 0000000000000000000000000000000000000000..613dea9473b14296dec8fa5c910d69cf7b9f2b82
--- /dev/null
+++ b/transformers/src/transformers/models/electra/modeling_electra.py
@@ -0,0 +1,1598 @@
+# coding=utf-8
+# Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ELECTRA model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN, get_activation
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithCrossAttentions,
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    logging,
+)
+from .configuration_electra import ElectraConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+    for name, array in zip(names, arrays):
+        original_name: str = name
+
+        try:
+            if isinstance(model, ElectraForMaskedLM):
+                name = name.replace("electra/embeddings/", "generator/embeddings/")
+
+            if discriminator_or_generator == "generator":
+                name = name.replace("electra/", "discriminator/")
+                name = name.replace("generator/", "electra/")
+
+            name = name.replace("dense_1", "dense_prediction")
+            name = name.replace("generator_predictions/output_bias", "generator_lm_head/bias")
+
+            name = name.split("/")
+            # print(original_name, name)
+            # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
+            # which are not required for using the pretrained model
+            if any(n in ["global_step", "temperature"] for n in name):
+                logger.info(f"Skipping {original_name}")
+                continue
+            pointer = model
+            for m_name in name:
+                if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                    scope_names = re.split(r"_(\d+)", m_name)
+                else:
+                    scope_names = [m_name]
+                if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                    pointer = getattr(pointer, "weight")
+                elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                    pointer = getattr(pointer, "bias")
+                elif scope_names[0] == "output_weights":
+                    pointer = getattr(pointer, "weight")
+                elif scope_names[0] == "squad":
+                    pointer = getattr(pointer, "classifier")
+                else:
+                    pointer = getattr(pointer, scope_names[0])
+                if len(scope_names) >= 2:
+                    num = int(scope_names[1])
+                    pointer = pointer[num]
+            if m_name.endswith("_embeddings"):
+                pointer = getattr(pointer, "weight")
+            elif m_name == "kernel":
+                array = np.transpose(array)
+            try:
+                if pointer.shape != array.shape:
+                    raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+            except ValueError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+            print(f"Initialize PyTorch weight {name}", original_name)
+            pointer.data = torch.from_numpy(array)
+        except AttributeError as e:
+            print(f"Skipping {original_name}", name, e)
+            continue
+    return model
+
+
+class ElectraEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "token_type_ids",
torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        # If token_type_ids is not passed, fall back to the all-zeros buffer registered in the constructor.
+        # This is the usual case when token_type_ids are auto-generated, and the registered buffer lets users
+        # trace the model without passing token_type_ids (solves issue #5664).
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Electra
+class ElectraSelfAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask:
Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
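+        # Shapes: query_layer is (batch, num_heads, q_len, head_size) and key_layer.transpose(-1, -2) is
+        # (batch, num_heads, head_size, k_len), so attention_scores is (batch, num_heads, q_len, k_len).
+        # The scores are scaled by 1/sqrt(head_size) further below, giving softmax(QK^T / sqrt(d_k)) V overall.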
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ElectraModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
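+        # Concretely: dropout zeroes individual entries of the (q_len, k_len) probability matrix, so for a
+        # given query some key positions receive no attention on this forward pass (training only).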
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class ElectraSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +ELECTRA_SELF_ATTENTION_CLASSES = { + "eager": ElectraSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra,BERT->ELECTRA +class ElectraAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = ELECTRA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = ElectraSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class ElectraIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + 
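+            # ACT2FN maps an activation name such as "gelu" or "relu" to its callable implementation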
self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class ElectraOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Electra +class ElectraLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ElectraAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = ElectraAttention(config, position_embedding_type="absolute") + self.intermediate = ElectraIntermediate(config) + self.output = ElectraOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[tuple[tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + 
cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Electra +class ElectraEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ElectraLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class ElectraDiscriminatorPredictions(nn.Module): + """Prediction module for the discriminator, made up of two dense layers.""" + + def __init__(self, config): + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = get_activation(config.hidden_act) + self.dense_prediction = nn.Linear(config.hidden_size, 1) + self.config = config + + def forward(self, discriminator_hidden_states): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = self.activation(hidden_states) + logits = self.dense_prediction(hidden_states).squeeze(-1) + + return logits + + +class ElectraGeneratorPredictions(nn.Module): + """Prediction module for the generator, made up of two dense layers.""" + + def __init__(self, config): + super().__init__() + + self.activation = get_activation("gelu") + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + + def forward(self, generator_hidden_states): + hidden_states = self.dense(generator_hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +@auto_docstring +class ElectraPreTrainedModel(PreTrainedModel): + config_class = ElectraConfig + load_tf_weights = load_tf_weights_in_electra + base_model_prefix = "electra" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + 
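+            # standard LayerNorm init: zero shift (bias, above) and unit scale (weight, below)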
module.weight.data.fill_(1.0) + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`ElectraForPreTraining`]. + """ +) +class ElectraForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss of the ELECTRA objective. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Prediction scores of the head (scores for each token before SoftMax). + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring +class ElectraModel(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = ElectraEmbeddings(config) + + if config.embedding_size != config.hidden_size: + self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) + + self.encoder = ElectraEncoder(config) + self.config = config + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, 
:seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention,
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        if hasattr(self, "embeddings_project"):
+            hidden_states = self.embeddings_project(hidden_states)
+
+        hidden_states = self.encoder(
+            hidden_states,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        return hidden_states
+
+
+class ElectraClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.activation = get_activation("gelu")
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = self.activation(x)  # although BERT uses tanh here, it seems Electra authors used gelu here
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Electra
+class ElectraSequenceSummary(nn.Module):
+    r"""
+    Compute a single vector summary of a sequence hidden states.
+
+    Args:
+        config ([`ElectraConfig`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+ - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + def __init__(self, config: ElectraConfig): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = nn.Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity() + + self.first_dropout = nn.Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = nn.Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward( + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + ) -> torch.FloatTensor: + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `torch.FloatTensor`: The summary of the sequence hidden states. 
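+
+        Example (an illustrative sketch; `ElectraSequenceSummary` is defined in this module rather than exported,
+        and the shapes follow from the default `ElectraConfig` values):
+
+        ```python
+        >>> import torch
+        >>> from transformers import ElectraConfig
+
+        >>> config = ElectraConfig(summary_type="first")
+        >>> summary = ElectraSequenceSummary(config)
+        >>> hidden_states = torch.randn(2, 10, config.hidden_size)  # (batch, seq_len, hidden)
+        >>> out = summary(hidden_states)  # takes hidden_states[:, 0], then optional projection/activation
+        ```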
+ """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +@auto_docstring( + custom_intro=""" + ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """ +) +class ElectraForSequenceClassification(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.electra = ElectraModel(config) + self.classifier = ElectraClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = discriminator_hidden_states[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Electra model with a binary classification head on top as used during pretraining for identifying generated tokens. + + It is recommended to load the discriminator checkpoint into that model. + """ +) +class ElectraForPreTraining(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.discriminator_predictions = ElectraDiscriminatorPredictions(config) + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], ElectraForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring) + Indices should be in `[0, 1]`: + + - 0 indicates the token is an original token, + - 1 indicates the token was replaced. 
+ + Examples: + + ```python + >>> from transformers import ElectraForPreTraining, AutoTokenizer + >>> import torch + + >>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator") + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator") + + >>> sentence = "The quick brown fox jumps over the lazy dog" + >>> fake_sentence = "The quick brown fox fake over the lazy dog" + + >>> fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True) + >>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt") + >>> discriminator_outputs = discriminator(fake_inputs) + >>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2) + + >>> fake_tokens + ['[CLS]', 'the', 'quick', 'brown', 'fox', 'fake', 'over', 'the', 'lazy', 'dog', '[SEP]'] + + >>> predictions.squeeze().tolist() + [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + + logits = self.discriminator_predictions(discriminator_sequence_output) + + loss = None + if labels is not None: + loss_fct = nn.BCEWithLogitsLoss() + if attention_mask is not None: + active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1 + active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss] + active_labels = labels[active_loss] + loss = loss_fct(active_logits, active_labels.float()) + else: + loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float()) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return ElectraForPreTrainingOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Electra model with a language modeling head on top. + + Even though both the discriminator and generator may be loaded into this model, the generator is the only model of + the two to have been trained for the masked language modeling task. 
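A minimal usage sketch of the masked language modeling path, reusing the generator checkpoint referenced elsewhere in this file; the prompt and the mask-lookup lines are illustrative and not part of the model code:

```python
>>> from transformers import AutoTokenizer, ElectraForMaskedLM
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator")
>>> model = ElectraForMaskedLM.from_pretrained("google/electra-base-generator")

>>> inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # Position of the [MASK] token, then the highest-scoring vocabulary entry at that position.
>>> mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
>>> tokenizer.decode(logits[0, mask_index].argmax(dim=-1))
```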
+ """ +) +class ElectraForMaskedLM(ElectraPreTrainedModel): + _tied_weights_keys = ["generator_lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.generator_predictions = ElectraGeneratorPredictions(config) + + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.generator_lm_head + + def set_output_embeddings(self, word_embeddings): + self.generator_lm_head = word_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + generator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + generator_sequence_output = generator_hidden_states[0] + + prediction_scores = self.generator_predictions(generator_sequence_output) + prediction_scores = self.generator_lm_head(prediction_scores) + + loss = None + # Masked language modeling softmax layer + if labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + generator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return MaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_hidden_states.hidden_states, + attentions=generator_hidden_states.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Electra model with a token classification head on top. + + Both the discriminator and generator may be loaded into this model. 
+ """ +) +class ElectraForTokenClassification(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.electra = ElectraModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + + discriminator_sequence_output = self.dropout(discriminator_sequence_output) + logits = self.classifier(discriminator_sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring +class ElectraForQuestionAnswering(ElectraPreTrainedModel): + config_class = ElectraConfig + base_model_prefix = "electra" + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.electra = ElectraModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.electra( + input_ids, + 
attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + sequence_output = discriminator_hidden_states[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + discriminator_hidden_states[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring +class ElectraForMultipleChoice(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.sequence_summary = ElectraSequenceSummary(config) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. 
+ + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + discriminator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = discriminator_hidden_states[0] + + pooled_output = self.sequence_summary(sequence_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring( + custom_intro=""" + ELECTRA Model with a `language modeling` head on top for CLM fine-tuning. 
+ """ +) +class ElectraForCausalLM(ElectraPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["generator_lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`") + + self.electra = ElectraModel(config) + self.generator_predictions = ElectraGeneratorPredictions(config) + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + + self.init_weights() + + def get_output_embeddings(self): + return self.generator_lm_head + + def set_output_embeddings(self, new_embeddings): + self.generator_lm_head = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[list[torch.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Example: + + ```python + >>> from transformers import AutoTokenizer, ElectraForCausalLM, ElectraConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator") + >>> config = ElectraConfig.from_pretrained("google/electra-base-generator") + >>> config.is_decoder = True + >>> model = ElectraForCausalLM.from_pretrained("google/electra-base-generator", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.generator_lm_head(self.generator_predictions(sequence_output)) + + lm_loss = None + if labels is not None: + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + 
logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM._reorder_cache + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +__all__ = [ + "ElectraForCausalLM", + "ElectraForMaskedLM", + "ElectraForMultipleChoice", + "ElectraForPreTraining", + "ElectraForQuestionAnswering", + "ElectraForSequenceClassification", + "ElectraForTokenClassification", + "ElectraModel", + "ElectraPreTrainedModel", + "load_tf_weights_in_electra", +] diff --git a/transformers/src/transformers/models/electra/modeling_flax_electra.py b/transformers/src/transformers/models/electra/modeling_flax_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..14d845476d62f9defb2de4392742037762fb959f --- /dev/null +++ b/transformers/src/transformers/models/electra/modeling_flax_electra.py @@ -0,0 +1,1614 @@ +# coding=utf-8 +# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxQuestionAnsweringModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_electra import ElectraConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/electra-small-discriminator" +_CONFIG_FOR_DOC = "ElectraConfig" + +remat = nn_partitioning.remat + + +@flax.struct.dataclass +class FlaxElectraForPreTrainingOutput(ModelOutput): + """ + Output type of [`ElectraForPreTraining`]. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: jnp.ndarray = None + hidden_states: Optional[tuple[jnp.ndarray]] = None + attentions: Optional[tuple[jnp.ndarray]] = None + + +ELECTRA_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`ElectraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ELECTRA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + head_mask (`numpy.ndarray` of shape `({0})`, `optional): + Mask to nullify selected heads of the attention modules. 
Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + +""" + + +class FlaxElectraEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.token_type_embeddings = nn.Embed( + self.config.type_vocab_size, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings.__call__ + def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True): + # Embed + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) + + # Sum all embeddings + hidden_states = inputs_embeds + token_type_embeddings + position_embeds + + # Layer Norm + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->Electra +class FlaxElectraSelfAttention(nn.Module): + config: ElectraConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.head_dim = self.config.hidden_size // self.config.num_attention_heads + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` " + " : {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,)) + + @nn.compact + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function 
takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic=True, + output_attentions: bool = False, + ): + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.query(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.key(key_value_states) + value_states = self.value(key_value_states) + else: + # self_attention + key_states = self.key(hidden_states) + value_states = self.value(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = 
jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->Electra +class FlaxElectraSelfOutput(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertAttention with Bert->Electra +class FlaxElectraAttention(nn.Module): + config: ElectraConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.self = FlaxElectraSelfAttention(self.config, causal=self.causal, dtype=self.dtype) + self.output = FlaxElectraSelfOutput(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states=None, + init_cache=False, + deterministic=True, + output_attentions: bool = False, + ): + # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length) + # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable + # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length) + attn_outputs = self.self( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=key_value_states, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + hidden_states = 
self.output(attn_output, hidden_states, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_outputs[1],) + + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->Electra +class FlaxElectraIntermediate(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->Electra +class FlaxElectraOutput(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states, attention_output, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + attention_output) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert->Electra +class FlaxElectraLayer(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxElectraAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype) + self.intermediate = FlaxElectraIntermediate(self.config, dtype=self.dtype) + self.output = FlaxElectraOutput(self.config, dtype=self.dtype) + if self.config.add_cross_attention: + self.crossattention = FlaxElectraAttention(self.config, causal=False, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + ): + # Self Attention + attention_outputs = self.attention( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = attention_outputs[0] + + # Cross-Attention Block + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=encoder_hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + + hidden_states = self.intermediate(attention_output) + hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_outputs[1],) + if encoder_hidden_states is not None: + outputs += (cross_attention_outputs[1],) + return outputs + + +# Copied from 
transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->Electra +class FlaxElectraLayerCollection(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + if self.gradient_checkpointing: + FlaxElectraCheckpointLayer = remat(FlaxElectraLayer, static_argnums=(5, 6, 7)) + self.layers = [ + FlaxElectraCheckpointLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + else: + self.layers = [ + FlaxElectraLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # Check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for " + f" {head_mask.shape[0]}." + ) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + attention_mask, + head_mask[i] if head_mask is not None else None, + encoder_hidden_states, + encoder_attention_mask, + init_cache, + deterministic, + output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->Electra +class FlaxElectraEncoder(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.layer = FlaxElectraLayerCollection( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxElectraGeneratorPredictions(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dense = nn.Dense(self.config.embedding_size, dtype=self.dtype) + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = ACT2FN[self.config.hidden_act](hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class FlaxElectraDiscriminatorPredictions(nn.Module): + """Prediction module for the discriminator, made up of two dense layers.""" + + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + self.dense_prediction = nn.Dense(1, dtype=self.dtype) + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = ACT2FN[self.config.hidden_act](hidden_states) + hidden_states = self.dense_prediction(hidden_states).squeeze(-1) + return hidden_states + + +class FlaxElectraPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ElectraConfig + base_model_prefix = "electra" + module_class: nn.Module = None + + def __init__( + self, + config: ElectraConfig, + input_shape: tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + gradient_checkpointing: bool = False, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing + def enable_gradient_checkpointing(self): + self._module = self.module_class( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=True, + ) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.init_weights + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + token_type_ids = jnp.zeros_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + attention_mask = jnp.ones_like(input_ids) + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + if self.config.add_cross_attention: + encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + else: + module_init_outputs = self.module.init( + rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False + ) + + random_params = module_init_outputs["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + 
params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: Optional[dict] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.ones_like(input_ids) + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + if self.config.add_cross_attention: + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. 
It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxElectraAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + else: + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + return outputs + + +class FlaxElectraModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.embeddings = FlaxElectraEmbeddings(self.config, dtype=self.dtype) + if self.config.embedding_size != self.config.hidden_size: + self.embeddings_project = nn.Dense(self.config.hidden_size, dtype=self.dtype) + self.encoder = FlaxElectraEncoder( + self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask: Optional[np.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + embeddings = self.embeddings( + input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic + ) + if hasattr(self, "embeddings_project"): + embeddings = self.embeddings_project(embeddings) + + return self.encoder( + embeddings, + attention_mask, + head_mask=head_mask, + deterministic=deterministic, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@add_start_docstrings( + "The bare Electra Model transformer outputting raw hidden-states without any specific head on top.", + ELECTRA_START_DOCSTRING, +) +class FlaxElectraModel(FlaxElectraPreTrainedModel): + module_class = FlaxElectraModule + + +append_call_sample_docstring(FlaxElectraModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC) + + +class FlaxElectraTiedDense(nn.Module): + embedding_size: int + dtype: jnp.dtype = 
jnp.float32 + precision = None + bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.bias = self.param("bias", self.bias_init, (self.embedding_size,)) + + def __call__(self, x, kernel): + x = jnp.asarray(x, self.dtype) + kernel = jnp.asarray(kernel, self.dtype) + y = lax.dot_general( + x, + kernel, + (((x.ndim - 1,), (0,)), ((), ())), + precision=self.precision, + ) + bias = jnp.asarray(self.bias, self.dtype) + return y + bias + + +class FlaxElectraForMaskedLMModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.generator_predictions = FlaxElectraGeneratorPredictions(config=self.config, dtype=self.dtype) + if self.config.tie_word_embeddings: + self.generator_lm_head = FlaxElectraTiedDense(self.config.vocab_size, dtype=self.dtype) + else: + self.generator_lm_head = nn.Dense(self.config.vocab_size, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + prediction_scores = self.generator_predictions(hidden_states) + + if self.config.tie_word_embeddings: + shared_embedding = self.electra.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + prediction_scores = self.generator_lm_head(prediction_scores, shared_embedding.T) + else: + prediction_scores = self.generator_lm_head(prediction_scores) + + if not return_dict: + return (prediction_scores,) + outputs[1:] + + return FlaxMaskedLMOutput( + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""Electra Model with a `language modeling` head on top.""", ELECTRA_START_DOCSTRING) +class FlaxElectraForMaskedLM(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForMaskedLMModule + + +append_call_sample_docstring(FlaxElectraForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC) + + +class FlaxElectraForPreTrainingModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.discriminator_predictions = FlaxElectraDiscriminatorPredictions(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + + logits = self.discriminator_predictions(hidden_states) + + if not 
return_dict: + return (logits,) + outputs[1:] + + return FlaxElectraForPreTrainingOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Electra model with a binary classification head on top as used during pretraining for identifying generated tokens. + + It is recommended to load the discriminator checkpoint into that model. + """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForPreTraining(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForPreTrainingModule + + +FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxElectraForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator") + >>> model = FlaxElectraForPreTraining.from_pretrained("google/electra-small-discriminator") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ``` +""" + +overwrite_call_docstring( + FlaxElectraForPreTraining, + ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING, +) +append_replace_return_docstrings( + FlaxElectraForPreTraining, output_type=FlaxElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC +) + + +class FlaxElectraForTokenClassificationModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Electra model with a token classification head on top. + + Both the discriminator and generator may be loaded into this model. + """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForTokenClassification(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForTokenClassificationModule + + +append_call_sample_docstring( + FlaxElectraForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +def identity(x, **kwargs): + return x + + +class FlaxElectraSequenceSummary(nn.Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`PretrainedConfig`]): + The config used by the model. 
Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.summary = identity + if hasattr(self.config, "summary_use_proj") and self.config.summary_use_proj: + if ( + hasattr(self.config, "summary_proj_to_labels") + and self.config.summary_proj_to_labels + and self.config.num_labels > 0 + ): + num_classes = self.config.num_labels + else: + num_classes = self.config.hidden_size + self.summary = nn.Dense(num_classes, dtype=self.dtype) + + activation_string = getattr(self.config, "summary_activation", None) + self.activation = ACT2FN[activation_string] if activation_string else lambda x: x # noqa F407 + + self.first_dropout = identity + if hasattr(self.config, "summary_first_dropout") and self.config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(self.config.summary_first_dropout) + + self.last_dropout = identity + if hasattr(self.config, "summary_last_dropout") and self.config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(self.config.summary_last_dropout) + + def __call__(self, hidden_states, cls_index=None, deterministic: bool = True): + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`jnp.ndarray` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`jnp.ndarray` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `jnp.ndarray`: The summary of the sequence hidden states. 
+ """ + # NOTE: this does "first" type summary always + output = hidden_states[:, 0] + output = self.first_dropout(output, deterministic=deterministic) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output, deterministic=deterministic) + return output + + +class FlaxElectraForMultipleChoiceModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.sequence_summary = FlaxElectraSequenceSummary(config=self.config, dtype=self.dtype) + self.classifier = nn.Dense(1, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None + + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + pooled_output = self.sequence_summary(hidden_states, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[1:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForMultipleChoice(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForMultipleChoiceModule + + +# adapt docstring slightly for FlaxElectraForMultipleChoice +overwrite_call_docstring( + FlaxElectraForMultipleChoice, ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxElectraForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxElectraForQuestionAnsweringModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + logits = self.qa_outputs(hidden_states) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + outputs[1:] + + return FlaxQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForQuestionAnswering(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForQuestionAnsweringModule + + +append_call_sample_docstring( + FlaxElectraForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxElectraClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__(self, hidden_states, deterministic: bool = True): + x = hidden_states[:, 0, :] # take token (equiv. 
to [CLS]) + x = self.dropout(x, deterministic=deterministic) + x = self.dense(x) + x = ACT2FN["gelu"](x) # although BERT uses tanh here, it seems Electra authors used gelu + x = self.dropout(x, deterministic=deterministic) + x = self.out_proj(x) + return x + + +class FlaxElectraForSequenceClassificationModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.classifier = FlaxElectraClassificationHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + logits = self.classifier(hidden_states, deterministic=deterministic) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Electra Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForSequenceClassification(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxElectraForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxElectraForCausalLMModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.generator_predictions = FlaxElectraGeneratorPredictions(config=self.config, dtype=self.dtype) + if self.config.tie_word_embeddings: + self.generator_lm_head = FlaxElectraTiedDense(self.config.vocab_size, dtype=self.dtype) + else: + self.generator_lm_head = nn.Dense(self.config.vocab_size, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask: Optional[jnp.ndarray] = None, + token_type_ids: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + prediction_scores = self.generator_predictions(hidden_states) 
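+        # When `config.tie_word_embeddings` is set, the head below reuses the word-embedding matrix:
+        # the generator-prediction output is projected onto `embedding.T` (plus a learned bias) through
+        # FlaxElectraTiedDense; otherwise a standalone Dense projection to `vocab_size` is applied.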
+ + if self.config.tie_word_embeddings: + shared_embedding = self.electra.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + prediction_scores = self.generator_lm_head(prediction_scores, shared_embedding.T) + else: + prediction_scores = self.generator_lm_head(prediction_scores) + + if not return_dict: + return (prediction_scores,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Electra Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for + autoregressive tasks. + """, + ELECTRA_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForCausalLM with Bert->Electra +class FlaxElectraForCausalLM(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. + # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxElectraForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxElectraForCausalLM", + "FlaxElectraForMaskedLM", + "FlaxElectraForMultipleChoice", + "FlaxElectraForPreTraining", + "FlaxElectraForQuestionAnswering", + "FlaxElectraForSequenceClassification", + "FlaxElectraForTokenClassification", + "FlaxElectraModel", + "FlaxElectraPreTrainedModel", +] diff --git a/transformers/src/transformers/models/electra/modeling_tf_electra.py b/transformers/src/transformers/models/electra/modeling_tf_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..b3af0f3afc42822df579672e3339aab2fe77bd01 --- /dev/null +++ b/transformers/src/transformers/models/electra/modeling_tf_electra.py @@ -0,0 +1,1776 @@ +# coding=utf-8 +# Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF Electra model.""" + +from __future__ import annotations + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFSequenceSummary, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_electra import ElectraConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/electra-small-discriminator" +_CONFIG_FOR_DOC = "ElectraConfig" + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra +class TFElectraSelfAttention(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + + self.is_decoder = config.is_decoder + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: 
tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFElectraModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
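+        # attention_probs has shape (batch_size, num_attention_heads, seq_len_q, seq_len_k); the matmul with
+        # value_layer below yields (batch_size, num_attention_heads, seq_len_q, attention_head_size), which is
+        # then transposed and reshaped back to (batch_size, seq_len_q, all_head_size).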
+ attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra +class TFElectraSelfOutput(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra +class TFElectraAttention(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFElectraSelfAttention(config, name="self") + self.dense_output = TFElectraSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], 
input_tensor=input_tensor, training=training + ) + # add attentions (possibly with past_key_value) if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra +class TFElectraIntermediate(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra +class TFElectraOutput(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra +class TFElectraLayer(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFElectraAttention(config, name="attention") + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = TFElectraAttention(config, name="crossattention") + self.intermediate = TFElectraIntermediate(config, name="intermediate") + self.bert_output = TFElectraOutput(config, name="output") + + def 
call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_value: tuple[tf.Tensor] | None, + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + input_tensor=attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + outputs # add attentions if we output them + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra +class TFElectraEncoder(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + self.config 
= config + self.layer = [TFElectraLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_values: tuple[tuple[tf.Tensor]] | None, + use_cache: Optional[bool], + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra +class TFElectraPooler(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
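+        # Note that TFElectraMainLayer (defined below) only builds embeddings, an optional embeddings_project
+        # and the encoder; it does not attach this pooler, so ELECTRA exposes no pooled output by default.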
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->Electra +class TFElectraEmbeddings(keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = config.embedding_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call + def call( + self, + input_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + token_type_ids: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + past_key_values_length=0, + training: bool = False, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. 
+ """ + if input_ids is None and inputs_embeds is None: + raise ValueError("Need to provide either `input_ids` or `input_embeds`.") + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + position_ids = tf.expand_dims( + tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +class TFElectraDiscriminatorPredictions(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense(config.hidden_size, name="dense") + self.dense_prediction = keras.layers.Dense(1, name="dense_prediction") + self.config = config + + def call(self, discriminator_hidden_states, training=False): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = get_tf_activation(self.config.hidden_act)(hidden_states) + logits = tf.squeeze(self.dense_prediction(hidden_states), -1) + + return logits + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "dense_prediction", None) is not None: + with tf.name_scope(self.dense_prediction.name): + self.dense_prediction.build([None, None, self.config.hidden_size]) + + +class TFElectraGeneratorPredictions(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense(config.embedding_size, name="dense") + self.config = config + + def call(self, generator_hidden_states, training=False): + hidden_states = self.dense(generator_hidden_states) + hidden_states = get_tf_activation("gelu")(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFElectraPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = ElectraConfig + base_model_prefix = "electra" + # When the model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"generator_lm_head.weight"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + +@keras_serializable +class TFElectraMainLayer(keras.layers.Layer): + config_class = ElectraConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.is_decoder = config.is_decoder + + self.embeddings = TFElectraEmbeddings(config, name="embeddings") + + if config.embedding_size != config.hidden_size: + self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project") + + self.encoder = TFElectraEncoder(config, name="encoder") + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + def get_extended_attention_mask(self, attention_mask, input_shape, dtype, past_key_values_length=0): + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask_shape = shape_list(attention_mask) + + mask_seq_length = seq_length + past_key_values_length + # Copied from `modeling_tf_t5.py` + # Provided a padding mask of dimensions [batch_size, mask_seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + if self.is_decoder: + seq_ids = tf.range(mask_seq_length) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), + seq_ids[None, :, None], + ) + causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype) + extended_attention_mask = causal_mask * attention_mask[:, None, :] + attention_mask_shape = shape_list(extended_attention_mask) + extended_attention_mask = tf.reshape( + extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2]) + ) + if past_key_values_length > 0: + extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :] + else: + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = tf.cast(extended_attention_mask, dtype=dtype) + one_cst = tf.constant(1.0, dtype=dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + return extended_attention_mask + + def get_head_mask(self, head_mask): + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + return head_mask + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[tuple[tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, tuple[tf.Tensor]]: + if not self.config.is_decoder: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_key_values_length = 0 + past_key_values = [None] * len(self.encoder.layer) + else: + past_key_values_length = shape_list(past_key_values[0][0])[-2] + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + hidden_states = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + training=training, + ) + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, hidden_states.dtype, past_key_values_length + ) + + # Copied from `modeling_tf_t5.py` with -1e9 -> -10000 + if self.is_decoder and encoder_attention_mask is not None: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + + head_mask = self.get_head_mask(head_mask) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states, training=training) + + hidden_states = self.encoder( + hidden_states=hidden_states, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "embeddings_project", None) is not None: + with tf.name_scope(self.embeddings_project.name): + self.embeddings_project.build([None, None, self.config.embedding_size]) + + +@dataclass +class TFElectraForPreTrainingOutput(ModelOutput): + """ + Output type of [`TFElectraForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`): + Total loss of the ELECTRA objective. + logits (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Prediction scores of the head (scores for each token before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: Optional[tf.Tensor] = None + hidden_states: tuple[tf.Tensor] | None = None + attentions: tuple[tf.Tensor] | None = None + + +ELECTRA_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. 
+ + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`ElectraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ELECTRA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " + "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " + "hidden size and embedding size are different. " + "" + "Both the generator and discriminator checkpoints may be loaded into this model.", + ELECTRA_START_DOCSTRING, +) +class TFElectraModel(TFElectraPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[tuple[tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, tuple[tf.Tensor]]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. 
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + """ + outputs = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + + +@add_start_docstrings( + """ + Electra model with a binary classification head on top as used during pretraining for identifying generated tokens. + + Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model + of the two to have the correct classification head to be used for this model. + """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForPreTraining(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFElectraForPreTrainingOutput, tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> import tensorflow as tf + >>> from transformers import AutoTokenizer, TFElectraForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator") + >>> model = TFElectraForPreTraining.from_pretrained("google/electra-small-discriminator") + >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + >>> outputs = model(input_ids) + >>> scores = outputs[0] + ```""" + discriminator_hidden_states = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + 
training=training, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + logits = self.discriminator_predictions(discriminator_sequence_output) + + if not return_dict: + return (logits,) + discriminator_hidden_states[1:] + + return TFElectraForPreTrainingOutput( + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "discriminator_predictions", None) is not None: + with tf.name_scope(self.discriminator_predictions.name): + self.discriminator_predictions.build(None) + + +class TFElectraMaskedLMHead(keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = config.embedding_size + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self): + return self.input_embeddings + + def set_output_embeddings(self, value): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +@add_start_docstrings( + """ + Electra model with a language modeling head on top. + + Even though both the discriminator and generator may be loaded into this model, the generator is the only model of + the two to have been trained for the masked language modeling task. + """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.config = config + self.electra = TFElectraMainLayer(config, name="electra") + self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions") + + if isinstance(config.hidden_act, str): + self.activation = get_tf_activation(config.hidden_act) + else: + self.activation = config.hidden_act + + self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") + + def get_lm_head(self): + return self.generator_lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. 
Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.generator_lm_head.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="google/electra-small-generator", + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="[MASK]", + expected_output="'paris'", + expected_loss=1.22, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMaskedLMOutput, tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + generator_hidden_states = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + generator_sequence_output = generator_hidden_states[0] + prediction_scores = self.generator_predictions(generator_sequence_output, training=training) + prediction_scores = self.generator_lm_head(prediction_scores, training=training) + loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores) + + if not return_dict: + output = (prediction_scores,) + generator_hidden_states[1:] + + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_hidden_states.hidden_states, + attentions=generator_hidden_states.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "generator_predictions", None) is not None: + with tf.name_scope(self.generator_predictions.name): + self.generator_predictions.build(None) + if getattr(self, "generator_lm_head", None) is not None: + with tf.name_scope(self.generator_lm_head.name): + self.generator_lm_head.build(None) + + +class TFElectraClassificationHead(keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + classifier_dropout = ( + config.classifier_dropout + if config.classifier_dropout is not None + else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( + config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + self.config = config + + def call(self, inputs, **kwargs): + x = inputs[:, 0, :] # take the <s> token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = get_tf_activation("gelu")(x) # although BERT uses tanh here, the ELECTRA authors seem to have used gelu + x = self.dropout(x) + x = self.out_proj(x) + + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.electra = TFElectraMainLayer(config, name="electra") + self.classifier = TFElectraClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="bhadresh-savani/electra-base-emotion", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'joy'", + expected_loss=0.06, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFSequenceClassifierOutput, tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if + `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
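+
+            Example (a minimal usage sketch added for clarity, not part of the original file; it reuses the
+            `bhadresh-savani/electra-base-emotion` checkpoint named in the `add_code_sample_docstrings` decorator
+            above, which normally autogenerates a similar snippet):
+
+            ```python
+            >>> import tensorflow as tf
+            >>> from transformers import AutoTokenizer, TFElectraForSequenceClassification
+
+            >>> tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/electra-base-emotion")
+            >>> model = TFElectraForSequenceClassification.from_pretrained("bhadresh-savani/electra-base-emotion")
+            >>> inputs = tokenizer("I feel great today!", return_tensors="tf")
+            >>> logits = model(**inputs).logits
+            >>> predicted_label = model.config.id2label[int(tf.math.argmax(logits, axis=-1)[0])]
+            ```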
+ """ + outputs = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + logits = self.classifier(outputs[0]) + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) + + +@add_start_docstrings( + """ + ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.sequence_summary = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="sequence_summary" + ) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMultipleChoiceModelOutput, tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) + """ + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + flat_inputs_embeds = ( + tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + outputs = self.electra( + input_ids=flat_input_ids, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + logits = self.sequence_summary(outputs[0]) + logits = self.classifier(logits) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Electra model with a token classification head on top. + + Both the discriminator and generator may be loaded into this model. 
+ """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="['B-LOC', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC']", + expected_loss=0.11, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFTokenClassifierOutput, tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + discriminator_hidden_states = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + discriminator_sequence_output = self.dropout(discriminator_sequence_output) + logits = self.classifier(discriminator_sequence_output) + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Electra Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.electra = TFElectraMainLayer(config, name="electra") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="bhadresh-savani/electra-base-squad2", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=11, + qa_target_end_index=12, + expected_output="'a nice puppet'", + expected_loss=2.64, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFQuestionAnsweringModelOutput, tuple[tf.Tensor]]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + discriminator_hidden_states = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + logits = self.qa_outputs(discriminator_sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + loss = None + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + discriminator_hidden_states[1:] + + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFElectraForMaskedLM", + "TFElectraForMultipleChoice", + "TFElectraForPreTraining", + "TFElectraForQuestionAnswering", + "TFElectraForSequenceClassification", + "TFElectraForTokenClassification", + "TFElectraModel", + "TFElectraPreTrainedModel", +] diff --git a/transformers/src/transformers/models/electra/tokenization_electra.py b/transformers/src/transformers/models/electra/tokenization_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..e252a1991048ddc9936678c43b32d869c0f77f73 --- /dev/null +++ b/transformers/src/transformers/models/electra/tokenization_electra.py @@ -0,0 +1,482 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import os +import unicodedata +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with Bert->Electra,BERT->Electra +class ElectraTokenizer(PreTrainedTokenizer): + r""" + Construct a Electra tokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original Electra). 
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + clean_up_tokenization_spaces=True, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text, split_special_tokens=False): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens if not split_special_tokens else None + ): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Electra sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. 
+ token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
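+
+        Example (an illustrative sketch of the default settings, not part of the original file; the expected
+        output follows from lowercasing plus the punctuation splitting described above):
+
+        ```python
+        >>> from transformers.models.electra.tokenization_electra import BasicTokenizer
+
+        >>> tokenizer = BasicTokenizer(do_lower_case=True)
+        >>> tokenizer.tokenize("Hello, WORLD! isn't")
+        ['hello', ',', 'world', '!', 'isn', "'", 't']
+        ```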
+ """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # 
despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +__all__ = ["ElectraTokenizer"] diff --git a/transformers/src/transformers/models/electra/tokenization_electra_fast.py b/transformers/src/transformers/models/electra/tokenization_electra_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..db0285581ed1eea5b903a3bed573bbf6408e0167 --- /dev/null +++ b/transformers/src/transformers/models/electra/tokenization_electra_fast.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +from typing import Optional + +from tokenizers import normalizers + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from .tokenization_electra import ElectraTokenizer + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->Electra , BERT->ELECTRA +class ElectraTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (`bool`, *optional*, defaults to `True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this + issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original ELECTRA). + wordpieces_prefix (`str`, *optional*, defaults to `"##"`): + The prefix for subwords. 
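+
+        Example (a minimal usage sketch, not part of the original file; the checkpoint name matches the one used
+        in the `TFElectraForPreTraining` example earlier in this diff):
+
+        ```python
+        >>> from transformers import ElectraTokenizerFast
+
+        >>> tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")
+        >>> encoding = tokenizer("Hello, my dog is cute")
+        >>> tokenizer.decode(encoding["input_ids"])  # wrapped as [CLS] X [SEP] by build_inputs_with_special_tokens
+        '[CLS] hello, my dog is cute [SEP]'
+        ```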
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = ElectraTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + normalizer_state.get("lowercase", do_lower_case) != do_lower_case + or normalizer_state.get("strip_accents", strip_accents) != strip_accents + or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars + ): + normalizer_class = getattr(normalizers, normalizer_state.pop("type")) + normalizer_state["lowercase"] = do_lower_case + normalizer_state["strip_accents"] = strip_accents + normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars + self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A ELECTRA sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1 is not None: + output += token_ids_1 + [self.sep_token_id] + + return output + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["ElectraTokenizerFast"] diff --git a/transformers/src/transformers/models/eomt/__init__.py b/transformers/src/transformers/models/eomt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9f4fe6327b312ff5f60ffb08c4b76566bf63f3f9 --- /dev/null +++ b/transformers/src/transformers/models/eomt/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_eomt import * + from .image_processing_eomt import * + from .image_processing_eomt_fast import * + from .modeling_eomt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/transformers/src/transformers/models/eomt/configuration_eomt.py b/transformers/src/transformers/models/eomt/configuration_eomt.py new file mode 100644 index 0000000000000000000000000000000000000000..670250721150e60df3d5da9280197cdad461beef --- /dev/null +++ b/transformers/src/transformers/models/eomt/configuration_eomt.py @@ -0,0 +1,168 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/eomt/modular_eomt.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_eomt.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig + + +class EomtConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EomtForUniversalSegmentation`]. It is used to instantiate an EoMT model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the EoMT + [tue-mps/coco_panoptic_eomt_large_640](https://huggingface.co/tue-mps/coco_panoptic_eomt_large_640) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads in each attention layer. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the MLP hidden dimensionality to the hidden size. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 640): + The size (resolution) of each input image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value for the LayerScale parameter. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The stochastic depth rate (drop path) used during training. + num_upscale_blocks (`int`, *optional*, defaults to 2): + Number of upsampling blocks used in the decoder or segmentation head. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability applied after attention projection. + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_blocks (`int`, *optional*, defaults to 4): + Number of feature blocks or stages in the architecture. + no_object_weight (`float`, *optional*, defaults to 0.1): + Loss weight for the 'no object' class in panoptic/instance segmentation. + class_weight (`float`, *optional*, defaults to 2.0): + Loss weight for classification targets. + mask_weight (`float`, *optional*, defaults to 5.0): + Loss weight for mask prediction. + dice_weight (`float`, *optional*, defaults to 5.0): + Loss weight for the dice loss component. + train_num_points (`int`, *optional*, defaults to 12544): + Number of points to sample for mask loss computation during training. + oversample_ratio (`float`, *optional*, defaults to 3.0): + Oversampling ratio used in point sampling for mask training. + importance_sample_ratio (`float`, *optional*, defaults to 0.75): + Ratio of points to sample based on importance during training. + num_queries (`int`, *optional*, defaults to 200): + Number of object queries in the Transformer. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of learnable register tokens added to the transformer input. 
+ + Example: + + ```python + >>> from transformers import EomtConfig, EomtForUniversalSegmentation + + >>> # Initialize configuration + >>> config = EomtConfig() + + >>> # Initialize model + >>> model = EomtForUniversalSegmentation(config) + + >>> # Access config + >>> config = model.config + ```""" + + model_type = "eomt" + + def __init__( + self, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=640, + patch_size=16, + num_channels=3, + layerscale_value=1.0, + drop_path_rate=0.0, + num_upscale_blocks=2, + attention_dropout=0.0, + use_swiglu_ffn=False, + num_blocks=4, + no_object_weight: float = 0.1, + class_weight: float = 2.0, + mask_weight: float = 5.0, + dice_weight: float = 5.0, + train_num_points: int = 12544, + oversample_ratio: float = 3.0, + importance_sample_ratio: float = 0.75, + num_queries=200, + num_register_tokens=4, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + + self.mlp_ratio = mlp_ratio + self.attention_dropout = attention_dropout + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.num_upscale_blocks = num_upscale_blocks + self.use_swiglu_ffn = use_swiglu_ffn + self.num_blocks = num_blocks + self.no_object_weight = no_object_weight + self.class_weight = class_weight + self.mask_weight = mask_weight + self.dice_weight = dice_weight + self.train_num_points = train_num_points + self.oversample_ratio = oversample_ratio + self.importance_sample_ratio = importance_sample_ratio + self.num_queries = num_queries + self.num_register_tokens = num_register_tokens + + +__all__ = ["EomtConfig"] diff --git a/transformers/src/transformers/models/eomt/convert_eomt_to_hf.py b/transformers/src/transformers/models/eomt/convert_eomt_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..6d822c1bfc8697930ca476e366fbbe5743f94410 --- /dev/null +++ b/transformers/src/transformers/models/eomt/convert_eomt_to_hf.py @@ -0,0 +1,340 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
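
Before walking through the conversion script below, a minimal sketch of round-tripping the `EomtConfig` defined above. The reduced sizes and the output directory are purely illustrative; `num_labels` is forwarded to `PretrainedConfig` through `**kwargs`, which is what sizes the classification head for segmentation.

```python
from transformers import EomtConfig

# Hypothetical smaller variant; anything not passed keeps the defaults documented above.
config = EomtConfig(
    hidden_size=384,
    num_hidden_layers=12,
    num_attention_heads=6,
    num_queries=100,
    num_labels=133,  # accepted by PretrainedConfig via **kwargs
)

config.save_pretrained("./eomt-small")           # writes ./eomt-small/config.json
reloaded = EomtConfig.from_pretrained("./eomt-small")
assert reloaded.num_queries == 100 and reloaded.model_type == "eomt"
```
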
+ +import argparse +import gc +import json +import os +import re +from typing import Optional + +import torch +from accelerate import init_empty_weights +from huggingface_hub import snapshot_download + +from transformers import EomtConfig, EomtForUniversalSegmentation, EomtImageProcessorFast + + +# fmt: off +MAPPINGS = { + # Embeddings + r"network.encoder.backbone.cls_token" : r"embeddings.cls_token", + r"network.encoder.backbone.reg_token" : r"embeddings.register_tokens", + r"network.encoder.backbone.pos_embed" : r"embeddings.position_embeddings.weight", + r"network.encoder.backbone.patch_embed.proj" : r"embeddings.patch_embeddings.projection", + + # Encoder Block + r"network.encoder.backbone.blocks.(\d+).norm1" : r"layers.\1.norm1", + r"network.encoder.backbone.blocks.(\d+).attn.proj" : r"layers.\1.attention.out_proj", + r"network.encoder.backbone.blocks.(\d+).ls1.gamma" : r"layers.\1.layer_scale1.lambda1", + r"network.encoder.backbone.blocks.(\d+).norm2" : r"layers.\1.norm2", + r"network.encoder.backbone.blocks.(\d+).ls2.gamma" : r"layers.\1.layer_scale2.lambda1", + r"network.encoder.backbone.blocks.(\d+).attn" : r"layers.\1.attention", + + # Others + r"network.q.weight" : r"query.weight", + r"network.class_head" : r"class_predictor", + r"network.upscale.(\d+).conv1" : r"upscale_block.block.\1.conv1", + r"network.upscale.(\d+).conv2" : r"upscale_block.block.\1.conv2", + r"network.upscale.(\d+).norm" : r"upscale_block.block.\1.layernorm2d", + r"network.mask_head.0" : r"mask_head.fc1", + r"network.mask_head.2" : r"mask_head.fc2", + r"network.mask_head.4" : r"mask_head.fc3", + r"network.encoder.backbone.norm" : r"layernorm", + r"network.attn_mask_probs" : r"attn_mask_probs", +} +# fmt: on + +# Mappings for MLP layers, depending on the type of MLP used in ckpts. +MLP_MAPPINGS = { + "swiglu_ffn": { + r"network.encoder.backbone.blocks.(\d+).mlp.fc1": r"layers.\1.mlp.weights_in", + r"network.encoder.backbone.blocks.(\d+).mlp.fc2": r"layers.\1.mlp.weights_out", + }, + "vanilla_mlp": { + r"network.encoder.backbone.blocks.(\d+).mlp": r"layers.\1.mlp", + }, +} + + +def convert_old_keys_to_new_keys(state_dict): + keys_as_text = "\n".join(state_dict.keys()) + new_keys_as_text = keys_as_text + for old, repl in MAPPINGS.items(): + if repl is None: + new_keys_as_text = re.sub(old, "", new_keys_as_text) + else: + new_keys_as_text = re.sub(old, repl, new_keys_as_text) + output_dict = dict(zip(keys_as_text.split("\n"), new_keys_as_text.split("\n"))) + return output_dict + + +def split_qkv_tensor(key, tensor): + """Splits a qkv tensor into separate q, k, v tensors and updates the key accordingly.""" + + new_keys = ["q_proj", "k_proj", "v_proj"] + split_size = tensor.shape[0] // 3 + split_tensors = torch.split(tensor, split_size, dim=0) + + return {key.replace("qkv", new_key): split_tensors[i] for i, new_key in enumerate(new_keys)} + + +def convert_state_dict_to_hf(state_dict): + """Convert state dict keys to HF format.""" + conversion_dict = convert_old_keys_to_new_keys(state_dict) + converted_state_dict = {} + + for old_key, new_key in conversion_dict.items(): + if new_key: + if "qkv" in new_key: # Detect merged attention keys and split them. 
+ qkv_split_dict = split_qkv_tensor(new_key, state_dict[old_key]) + converted_state_dict.update(qkv_split_dict) + else: + converted_state_dict[new_key] = state_dict[old_key] + + for i in [ + "network.encoder.pixel_mean", + "network.encoder.pixel_std", + ]: + converted_state_dict.pop(i) + + # Embeddings will not have initial dimension + pos_embed_key = "embeddings.position_embeddings.weight" + converted_state_dict[pos_embed_key] = converted_state_dict[pos_embed_key].squeeze(0) + + return converted_state_dict + + +def ensure_model_downloaded( + repo_id: Optional[str] = None, revision: Optional[str] = None, local_dir: Optional[str] = None +) -> str: + """ + Ensures model files are downloaded locally, downloads them if not. + Returns path to local files. + + Args: + repo_id: The Hugging Face model repo ID (required if local_dir not provided) + revision: Optional git revision to use + local_dir: Optional local directory path where model files should be stored/found + """ + if local_dir is not None: + if os.path.exists(local_dir): + print(f"Using provided local directory: {local_dir}") + else: + # Create the local directory if it doesn't exist + os.makedirs(local_dir, exist_ok=True) + print(f"Created local directory: {local_dir}") + + if repo_id is None: + raise ValueError("Either repo_id or local_dir must be provided") + + print(f"Ensuring {repo_id} (revision: {revision or 'latest'}) is downloaded...") + + try: + # First try to find files locally + download_dir = snapshot_download(repo_id, revision=revision, local_files_only=True, local_dir=local_dir) + print(f"Found model files locally at {download_dir}") + return download_dir + except Exception: + # If files not found locally, download them + print(f"Downloading model files for {repo_id}...") + download_dir = snapshot_download(repo_id, revision=revision, local_files_only=False, local_dir=local_dir) + print(f"Downloaded model files to {download_dir}") + return download_dir + + +def load_model_state_dict(input_path: str) -> dict: + """ + Load model state dict, handling both single and sharded files. 
+ """ + index_path = os.path.join(input_path, "pytorch_model.bin.index.json") + single_file_path = os.path.join(input_path, "pytorch_model.bin") + + # Check if we have a sharded model + if os.path.exists(index_path): + print("Loading sharded model...") + state_dict = {} + with open(index_path, "r") as f: + index = json.load(f) + + # Get unique shard files and load each one only once + unique_shard_files = sorted(set(index["weight_map"].values())) + for shard_file in unique_shard_files: + print(f"Loading shard {shard_file}...") + shard_path = os.path.join(input_path, shard_file) + shard_dict = torch.load(shard_path, map_location="cpu") + state_dict.update(shard_dict) + + return state_dict + + # Single file model + elif os.path.exists(single_file_path): + print("Loading single file model...") + return torch.load(single_file_path, map_location="cpu") + + else: + raise ValueError(f"No model files found in {input_path}") + + +def convert_model( + repo_id=None, + local_dir=None, + output_dir=None, + output_hub_path=None, + safe_serialization=True, + revision=None, +): + """Convert and save the model weights, processor, and configuration.""" + if output_dir is None and output_hub_path is None: + raise ValueError("At least one of output_dir or output_hub_path must be specified") + + if repo_id is None and local_dir is None: + raise ValueError("Either repo_id or local_dir must be specified") + + # Create output directory if specified + if output_dir: + os.makedirs(output_dir, exist_ok=True) + print(f"Created/verified output directory: {output_dir}") + + torch.set_default_dtype(torch.float16) + + # Download or locate model files + input_path = ensure_model_downloaded(repo_id=repo_id, revision=revision, local_dir=local_dir) + + with open(os.path.join(input_path, "config.json"), "r") as f: + config_data = json.load(f) + # Pop off unwanted keys + _ = config_data.pop("backbone", None) + + config = EomtConfig( + **{ + **config_data, + "layerscale_value": 1e-5, + } + ) + + if "semantic" in repo_id.split("_"): + size = {"shortest_edge": config.image_size, "longest_edge": None} + do_split_image = True + do_pad = False + else: + size = {"shortest_edge": config.image_size, "longest_edge": config.image_size} + do_split_image = False + do_pad = True + + if "giant" in repo_id.split("_"): + config.use_swiglu_ffn = True + config.hidden_size = 1536 + config.num_hidden_layers = 40 + config.num_attention_heads = 24 + # Update MAPPINGS for ckpts depending on the MLP type + MAPPINGS.update(MLP_MAPPINGS["swiglu_ffn"]) + else: + MAPPINGS.update(MLP_MAPPINGS["vanilla_mlp"]) + + processor = EomtImageProcessorFast(size=size, do_split_image=do_split_image, do_pad=do_pad) + + # Save the config and processor + if output_dir: + config.save_pretrained(output_dir) + processor.save_pretrained(output_dir) + if output_hub_path: + config.push_to_hub(output_hub_path) + processor.push_to_hub(output_hub_path) + + # Initialize model with empty weights + print("Creating empty model...") + with init_empty_weights(): + model = EomtForUniversalSegmentation(config) + + # Load and convert state dict + print("Loading state dict...") + state_dict = load_model_state_dict(input_path) + state_dict = convert_state_dict_to_hf(state_dict) + + # Load converted state dict + print("Loading converted weights into model...") + model.load_state_dict(state_dict, strict=True, assign=True) + + # Save the model + if output_dir: + print(f"Saving model to {output_dir}...") + model.save_pretrained(output_dir, safe_serialization=safe_serialization) + if 
output_hub_path: + print(f"Pushing model to hub at {output_hub_path}...") + model.push_to_hub(output_hub_path, safe_serialization=safe_serialization) + + del state_dict, model + gc.collect() + + # Validate the saved model if saved locally + if output_dir: + print("Reloading the local model to check if it's saved correctly...") + EomtForUniversalSegmentation.from_pretrained(output_dir, device_map="auto") + print("Local model reloaded successfully.") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--hf_repo_id", + help="HuggingFace Hub repo ID for the model", + default=None, + ) + parser.add_argument( + "--local_dir", + help="Local directory containing the model files", + default=None, + ) + parser.add_argument( + "--revision", + help="Specific revision to download from the Hub", + default=None, + ) + parser.add_argument( + "--output_dir", + help="Location to write HF model locally", + default=None, + ) + parser.add_argument( + "--output_hub_path", + help="Repository ID to push model to hub (e.g. 'username/model-name')", + default=None, + ) + parser.add_argument( + "--safe_serialization", + action="store_true", + help="Whether to save using safetensors", + ) + args = parser.parse_args() + + if args.output_dir is None and args.output_hub_path is None: + raise ValueError("At least one of --output_dir or --output_hub_path must be specified") + + if args.hf_repo_id is None and args.local_dir is None: + raise ValueError("Either --hf_repo_id or --local_dir must be specified") + + convert_model( + repo_id=args.hf_repo_id, + local_dir=args.local_dir, + output_dir=args.output_dir, + output_hub_path=args.output_hub_path, + safe_serialization=args.safe_serialization, + revision=args.revision, + ) + + +if __name__ == "__main__": + main() diff --git a/transformers/src/transformers/models/eomt/image_processing_eomt.py b/transformers/src/transformers/models/eomt/image_processing_eomt.py new file mode 100644 index 0000000000000000000000000000000000000000..e63a1be95fe4db52e6d2be685a364ca2f2b518da --- /dev/null +++ b/transformers/src/transformers/models/eomt/image_processing_eomt.py @@ -0,0 +1,973 @@ +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
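
The conversion entry point above can also be driven from Python rather than the CLI. The sketch below assumes the script is run from a source checkout so that `convert_eomt_to_hf` is importable, and both paths are placeholders; the equivalent CLI invocation would pass `--local_dir`, `--output_dir`, and `--safe_serialization`.

```python
# Sketch only: drive convert_model() directly. The local directory must contain the
# original checkpoint files the loader above expects (pytorch_model.bin or a sharded
# index, plus config.json); the output directory receives the converted HF model.
from convert_eomt_to_hf import convert_model

convert_model(
    local_dir="./checkpoints/eomt_large_640_original",  # placeholder path
    output_dir="./eomt_large_640_hf",                   # placeholder path
    safe_serialization=True,
)
```
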
+"""Image processor class for EoMT.""" + +import math +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + PaddingMode, + pad, + resize, +) +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_flat_list_of_images, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + TensorType, + filter_out_non_signature_kwargs, + is_torch_available, + logging, +) + + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + import torch.nn.functional as F + + +# Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks +def convert_segmentation_map_to_binary_masks( + segmentation_map: "np.ndarray", + instance_id_to_semantic_id: Optional[dict[int, int]] = None, + ignore_index: Optional[int] = None, +): + if ignore_index is not None: + segmentation_map = np.where(segmentation_map == 0, ignore_index, segmentation_map - 1) + + # Get unique ids (class or instance ids based on input) + all_labels = np.unique(segmentation_map) + + # Drop background label if applicable + if ignore_index is not None: + all_labels = all_labels[all_labels != ignore_index] + + # Generate a binary mask for each object instance + binary_masks = [(segmentation_map == i) for i in all_labels] + + # Stack the binary masks + if binary_masks: + binary_masks = np.stack(binary_masks, axis=0) + else: + binary_masks = np.zeros((0, *segmentation_map.shape)) + + # Convert instance ids to class ids + if instance_id_to_semantic_id is not None: + labels = np.zeros(all_labels.shape[0]) + + for label in all_labels: + class_id = instance_id_to_semantic_id[label + 1 if ignore_index is not None else label] + labels[all_labels == label] = class_id - 1 if ignore_index is not None else class_id + else: + labels = all_labels + + return binary_masks.astype(np.float32), labels.astype(np.int64) + + +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. 
+ """ + height, width = image_size + raw_size = None + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) + + if (height <= width and height == size) or (width <= height and width == size): + oh, ow = height, width + elif width < height: + ow = size + if max_size is not None and raw_size is not None: + oh = round(raw_size * height / width) + else: + oh = round(size * height / width) + else: + oh = size + if max_size is not None and raw_size is not None: + ow = round(raw_size * width / height) + else: + ow = round(size * width / height) + + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects +def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels): + """ + Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and + `labels`. + + Args: + masks (`torch.Tensor`): + A tensor of shape `(num_queries, height, width)`. + scores (`torch.Tensor`): + A tensor of shape `(num_queries)`. + labels (`torch.Tensor`): + A tensor of shape `(num_queries)`. + object_mask_threshold (`float`): + A number between 0 and 1 used to binarize the masks. + Raises: + `ValueError`: Raised when the first dimension doesn't match in all input tensors. + Returns: + `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region + < `object_mask_threshold`. + """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_mask = mask_probs[k] >= mask_threshold + original_area = original_mask.sum() + + final_mask = mask_k & original_mask + final_mask_area = final_mask.sum() + + mask_exists = mask_k_area > 0 and original_area > 0 and final_mask_area > 0 + + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, final_mask + + +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + stuff_classes, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + target_size: Optional[tuple[int, int]] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.long, device=mask_probs.device) - 1 + segments: list[dict] = [] + + # Compute per-pixel assignment based on weighted mask scores + mask_probs = mask_probs.sigmoid() + mask_labels = (pred_scores[:, None, None] * mask_probs).argmax(0) + + # Keep track of instances of each class + current_segment_id = 0 + stuff_memory_list: dict[str, int] = {} + + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + + # Check if mask exists and large enough to be a segment + mask_exists, final_mask 
= check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if not mask_exists: + continue + + if stuff_classes and pred_class in stuff_classes: + if pred_class in stuff_memory_list: + segmentation[final_mask] = stuff_memory_list[pred_class] + continue + else: + stuff_memory_list[pred_class] = current_segment_id + + segmentation[final_mask] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "score": segment_score, + } + ) + current_segment_id += 1 + return segmentation, segments + + +def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]: + """Returns the height and width from a size dict.""" + target_height = size_dict["shortest_edge"] + target_width = size_dict.get("longest_edge", None) or target_height + + return target_height, target_width + + +class EomtImageProcessor(BaseImageProcessor): + r""" + Constructs a EoMT image processor. The image processor can be used to prepare image(s) and optional targets + for the model. + + This image processor inherits from [`BaseImageProcessor`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int`, *optional*, defaults to 640): + Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a + sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of + the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * + height / width, size)`. + resample (`int`, *optional*, defaults to `Resampling.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, + `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, + `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set + to `True`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the input to a certain `scale`. + rescale_factor (`float`, *optional*, defaults to `1/ 255`): + Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + do_split_image (`bool`, *optional*, defaults to `False`): + Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the + input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. + Otherwise, the input images will be padded to the target size. + do_pad (`bool`, *optional*, defaults to `False`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. + image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the + ImageNet std. 
+ ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: float = 1 / 255, + do_normalize: bool = True, + do_split_image: bool = False, + do_pad: bool = False, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + ignore_index: Optional[int] = None, + num_labels: Optional[int] = None, + **kwargs, + ): + super().__init__(**kwargs) + + size = size if size is not None else {"shortest_edge": 640, "longest_edge": 640} + size = get_size_dict(size, default_to_square=False) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_split_image = do_split_image + self.do_pad = do_pad + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.ignore_index = ignore_index + self.num_labels = num_labels + + def resize( + self, + image: np.ndarray, + size: dict, + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format=None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + image_size = get_image_size(image) + output_size = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"]) + + image = resize( + image=image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + return_numpy=True, + **kwargs, + ) + + return image + + def _split_image(self, image: ImageInput, size: dict, image_index: int) -> tuple[list, list]: + """Slices an image into overlapping patches for semantic segmentation.""" + + patches, patch_offsets = [], [] + + image_size = get_image_size(image) + patch_size = size["shortest_edge"] + + longer_side = max(image_size) + num_patches = math.ceil(longer_side / patch_size) + total_overlap = num_patches * patch_size - longer_side + overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0 + + for i in range(num_patches): + start = int(i * (patch_size - overlap_per_patch)) + end = start + patch_size + + if image_size[0] > image_size[1]: + patch = image[:, start:end, :] + else: + patch = image[:, :, start:end] + + patches.append(patch) + patch_offsets.append([image_index, start, end]) + + return patches, patch_offsets + + def _pad(self, image: ImageInput, size: dict) -> np.ndarray: + """Pads the image to the target size using zero padding.""" + height, width = get_image_size(image) + + target_height, target_width = get_target_size(size) + pad_h = max(0, target_height - height) + pad_w = max(0, target_width - width) + + padding = ((0, pad_h), (0, pad_w)) + + # Channel axis is last; default padding format is compatible + padded_image = pad(image=image, padding=padding, mode=PaddingMode.CONSTANT, constant_values=0.0) + return padded_image + + def _preprocess_images( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = None, + do_split_image: Optional[bool] = None, + do_pad: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a batch of images.""" + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [ + self.resize( + image, + size=size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + + processed_images, patch_offsets = [], [] + + if do_split_image: + for idx, img in enumerate(images): + patches, offsets = self._split_image(img, size, idx) + processed_images.extend(patches) + patch_offsets.extend(offsets) + + images = processed_images + + if do_pad: + images = [self._pad(img, size) for img in images] + + if do_rescale: + images = [self.rescale(img, scale=rescale_factor, input_data_format=input_data_format) for img in images] + + if do_normalize: + images = [ + self.normalize( + image, + mean=image_mean, + std=image_std, + input_data_format=input_data_format, + ) + for image in images + ] + + return images, patch_offsets + + def _preprocess_mask( + self, + segmentation_map: ImageInput, + do_resize: Optional[bool] = False, + do_pad: Optional[bool] = False, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = None, + data_format: Union[str, ChannelDimension] = None, + 
input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a single mask.""" + # Add channel dimension if missing - needed for certain transformations + if segmentation_map.ndim == 2: + added_channel_dim = True + segmentation_map = segmentation_map[None, ...] + input_data_format = ChannelDimension.FIRST + else: + added_channel_dim = False + if input_data_format is None: + input_data_format = infer_channel_dimension_format(segmentation_map) + + if do_resize: + segmentation_map = self.resize( + segmentation_map, + size=size, + resample=resample, + data_format=data_format, + ) + + if do_pad: + segmentation_map = self._pad(segmentation_map, size) + + # Remove extra channel dimension if added for processing + if added_channel_dim: + segmentation_map = segmentation_map.squeeze(0) + return torch.from_numpy(segmentation_map) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, + instance_id_to_semantic_id: Optional[dict[int, int]] = None, + do_split_image: Optional[bool] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + do_pad: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + ignore_index: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Preprocesses images or a batch of images. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. + segmentation_maps (`ImageInput`, *optional*): + The corresponding semantic segmentation maps with the pixel-wise annotations. + instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*): + A mapping between object instance ids and class ids. + do_split_image (`bool`, *optional*, defaults to `self.do_split_image`): + Whether to split the input images into overlapping patches for semantic segmentation. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the input images. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Target size as a dictionary with `"shortest_edge"` and `"longest_edge"` keys. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use when resizing. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the input images by `rescale_factor`. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Factor to scale image pixel values. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the input images. + do_pad (`bool`, *optional*, defaults to `False`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Mean for normalization. Single value or list for each channel. 
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Standard deviation for normalization. Single value or list for each channel. + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be `"pt"`, `"tf"`, `"np"`, or `"jax"`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + Channel format of the output image. Either `"channels_first"` or `"channels_last"`. + input_data_format (`ChannelDimension` or `str`, *optional*): + Channel format of the input image. + """ + + do_split_image = do_split_image if do_split_image is not None else self.do_split_image + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_pad = do_pad if do_pad is not None else self.do_pad + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + ignore_index = ignore_index if ignore_index is not None else self.ignore_index + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." 
+ ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + pixel_values_list, patch_offsets = self._preprocess_images( + images=images, + do_resize=do_resize, + size=size, + resample=resample, + do_split_image=do_split_image, + do_pad=do_pad, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + + if segmentation_maps is not None: + segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2) + segmentation_maps = [to_numpy_array(mask) for mask in segmentation_maps] + + segmentation_maps = [ + self._preprocess_mask( + segmentation_map, + do_resize=do_resize, + do_pad=do_pad, + size=size, + resample=PILImageResampling.NEAREST, + data_format=data_format, + input_data_format=input_data_format, + ) + for segmentation_map in segmentation_maps + ] + + encoded_inputs = self.encode_inputs( + pixel_values_list, + segmentation_maps, + instance_id_to_semantic_id, + ignore_index, + return_tensors, + input_data_format=data_format, + ) + + if do_split_image and patch_offsets: + encoded_inputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets] + + return encoded_inputs + + def encode_inputs( + self, + pixel_values_list: list[ImageInput], + segmentation_maps: ImageInput = None, + instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, + ignore_index: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. + + EoMT addresses semantic segmentation with a mask classification paradigm, thus input segmentation maps + will be converted to lists of binary masks and their respective labels. Let's see an example, assuming + `segmentation_maps = [[2,6,7,9]]`, the output will contain `mask_labels = + [[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]]` (four binary masks) and `class_labels = [2,6,7,9]`, the labels for + each mask. + + Args: + pixel_values_list (`list[ImageInput]`): + list of images (pixel values) to be padded. Each image should be a tensor of shape `(channels, height, + width)`. + + segmentation_maps (`ImageInput`, *optional*): + The corresponding semantic segmentation maps with the pixel-wise annotations. + + (`bool`, *optional*, defaults to `True`): + Whether or not to pad images up to the largest image in a batch and create a pixel mask. + + If left to the default, will return a pixel mask that is: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*): + A mapping between object instance ids and class ids. If passed, `segmentation_maps` is treated as an + instance segmentation map where each pixel represents an instance id. Can be provided as a single + dictionary with a global/dataset-level mapping or as a list of dictionaries (one per image), to map + instance ids in each image separately. + + return_tensors (`str` or [`~file_utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch `torch.Tensor` + objects. 
+ + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **mask_labels** -- Optional list of mask labels of shape `(labels, height, width)` to be fed to a model + (when `annotations` are provided). + - **class_labels** -- Optional list of class labels of shape `(labels)` to be fed to a model (when + `annotations` are provided). They identify the labels of `mask_labels`, e.g. the label of + `mask_labels[i][j]` if `class_labels[i][j]`. + """ + ignore_index = self.ignore_index if ignore_index is None else ignore_index + + pixel_values_list = [to_numpy_array(pixel_values) for pixel_values in pixel_values_list] + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(pixel_values_list[0]) + + encoded_inputs = BatchFeature({"pixel_values": pixel_values_list}, tensor_type=return_tensors) + + if segmentation_maps is not None: + mask_labels = [] + class_labels = [] + # Convert to list of binary masks and labels + for idx, segmentation_map in enumerate(segmentation_maps): + segmentation_map = to_numpy_array(segmentation_map) + if isinstance(instance_id_to_semantic_id, list): + instance_id = instance_id_to_semantic_id[idx] + else: + instance_id = instance_id_to_semantic_id + # Use instance2class_id mapping per image + masks, classes = convert_segmentation_map_to_binary_masks( + segmentation_map, + instance_id, + ignore_index=ignore_index, + ) + + mask_labels.append(torch.from_numpy(masks)) + class_labels.append(torch.from_numpy(classes)) + + # we cannot batch them since they don't share a common class size + encoded_inputs["mask_labels"] = mask_labels + encoded_inputs["class_labels"] = class_labels + + return encoded_inputs + + def merge_image_patches( + self, + segmentation_logits: torch.Tensor, + patch_offsets: list[tuple[int, int, int]], + target_sizes: list[tuple[int, int]], + size: dict[str, int], + ) -> list[torch.Tensor]: + """ + Reconstructs full-size semantic segmentation logits from patch predictions. + + Args: + segmentation_logits (`torch.Tensor`): + A tensor of shape `(num_patches, num_classes, patch_height, patch_width)` representing predicted logits + for each image patch. + patch_offsets (`list[tuple[int, int, int]]`): + A list of tuples where each tuple contains: + - `image_index` (int): Index of the original image this patch belongs to. + - `start` (int): Start pixel index of the patch along the long dimension (height or width). + - `end` (int): End pixel index of the patch along the long dimension. + target_sizes (`list[tuple[int, int]]`): + list of original (height, width) dimensions for each image before preprocessing. + size (`dict[str, int]`): + A size dict which was used to resize. 
+ """ + num_classes = segmentation_logits.shape[1] + aggregated_logits = [] + patch_counts = [] + + for image_size in target_sizes: + height, width = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"]) + aggregated_logits.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device)) + patch_counts.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device)) + + # Stitch patches back into full-sized logit maps + for patch_idx, (image_idx, patch_start, patch_end) in enumerate(patch_offsets): + if target_sizes[image_idx][0] > target_sizes[image_idx][1]: + aggregated_logits[image_idx][:, patch_start:patch_end, :] += segmentation_logits[patch_idx] + patch_counts[image_idx][:, patch_start:patch_end, :] += 1 + else: + aggregated_logits[image_idx][:, :, patch_start:patch_end] += segmentation_logits[patch_idx] + patch_counts[image_idx][:, :, patch_start:patch_end] += 1 + + # Normalize and resize logits to original image size + reconstructed_logits = [] + for idx, (logit_sum, count) in enumerate(zip(aggregated_logits, patch_counts)): + averaged_logits = logit_sum / count.clamp(min=1) + resized_logits = F.interpolate( + averaged_logits[None, ...], + size=target_sizes[idx], + mode="bilinear", + align_corners=False, + )[0] + + reconstructed_logits.append(resized_logits) + + return reconstructed_logits + + def unpad_image( + self, + segmentation_logits: torch.Tensor, + target_sizes: list[tuple[int, int]], + size: dict[str, int], + ) -> list[torch.Tensor]: + """Restores panoptic segmentation logits to their original image resolutions.""" + + resized_logits = [] + + for idx, original_size in enumerate(target_sizes): + target_height, target_width = get_size_with_aspect_ratio( + original_size, size["shortest_edge"], size["longest_edge"] + ) + cropped_logits = segmentation_logits[idx][:, :target_height, :target_width] + upsampled_logits = F.interpolate( + cropped_logits[None, ...], size=original_size, mode="bilinear", align_corners=False + )[0] + resized_logits.append(upsampled_logits) + return resized_logits + + def post_process_semantic_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + size: Optional[dict[str, int]] = None, + ) -> np.ndarray: + """Post-processes model outputs into final semantic segmentation prediction.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width] + class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1] + patch_offsets = outputs.patch_offsets + + output_size = get_target_size(size) + masks_queries_logits = F.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + # Remove the null class `[..., :-1]` + masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1] + masks_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + segmentation_logits = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs) + + output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, target_sizes, size) + + preds = [logit.argmax(dim=0) for logit in output_logits] + return preds + + def post_process_panoptic_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + threshold: float = 0.8, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + stuff_classes: Optional[list[int]] = None, + size: Optional[dict[str, int]] = None, + ): + 
"""Post-processes model outputs into final panoptic segmentation prediction.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width] + class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + output_size = get_target_size(size) + masks_queries_logits = F.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size) + pred_scores_batch, pred_labels_batch = class_queries_logits.softmax(dim=-1).max(-1) + + results: list = [] + + for i in range(batch_size): + mask_probs, pred_scores, pred_labels = remove_low_and_no_objects( + mask_probs_batch[i], pred_scores_batch[i], pred_labels_batch[i], threshold, num_labels + ) + + # No mask found + if mask_probs.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + segmentation, segments = compute_segments( + mask_probs=mask_probs, + pred_scores=pred_scores, + pred_labels=pred_labels, + stuff_classes=stuff_classes, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + target_size=target_sizes[i] if target_sizes is not None else None, + ) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + @filter_out_non_signature_kwargs() + def post_process_instance_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + threshold: float = 0.5, + size: Optional[dict[str, int]] = None, + ): + """Post-processes model outputs into Instance Segmentation Predictions.""" + + size = size if size is not None else self.size + + class_queries_logits = outputs.class_queries_logits + masks_queries_logits = outputs.masks_queries_logits + + output_size = get_target_size(size) + masks_queries_logits = F.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size) + + device = masks_queries_logits.device + batch_size = class_queries_logits.shape[0] + num_queries = class_queries_logits.shape[-2] + + results = [] + + for i in range(batch_size): + mask_pred = mask_probs_batch[i] + mask_class = class_queries_logits[i] + + # Remove the null class `[..., :-1]` + scores, pred_classes = mask_class.softmax(dim=-1)[..., :-1].max(-1) + pred_masks = (mask_pred > 0).float() + + # Calculate average mask prob + mask_scores = (mask_pred.sigmoid().flatten(1) * pred_masks.flatten(1)).sum(1) / ( + pred_masks.flatten(1).sum(1) + 1e-6 + ) + pred_scores = scores * mask_scores + + segmentation = torch.zeros(target_sizes[i], device=device) - 1 + + instance_maps, segments = [], [] + current_segment_id = 0 + for j in range(num_queries): + score = pred_scores[j].item() + + if not torch.all(pred_masks[j] == 0) and score >= threshold: + segmentation[pred_masks[j] == 1] = current_segment_id + segments.append( + { + "id": current_segment_id, + "label_id": pred_classes[j].item(), + "score": round(score, 6), + } + ) + current_segment_id += 1 + instance_maps.append(pred_masks[j]) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + +__all__ = 
["EomtImageProcessor"] diff --git a/transformers/src/transformers/models/eomt/image_processing_eomt_fast.py b/transformers/src/transformers/models/eomt/image_processing_eomt_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..343c6ae2cf1aefa02daed7a382196e8a6f94dfbe --- /dev/null +++ b/transformers/src/transformers/models/eomt/image_processing_eomt_fast.py @@ -0,0 +1,582 @@ +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for EoMT.""" + +import math +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, + make_list_of_images, + pil_torch_interpolation_mapping, + validate_kwargs, +) +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, + filter_out_non_signature_kwargs, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, +) +from .image_processing_eomt import ( + compute_segments, + convert_segmentation_map_to_binary_masks, + get_size_with_aspect_ratio, + remove_low_and_no_objects, +) + + +if is_torch_available(): + import torch + +if is_torchvision_available(): + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs): + """ + do_split_image (`bool`, *optional*, defaults to `False`): + Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the + input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. + Otherwise, the input images will be padded to the target size. + do_pad (`bool`, *optional*, defaults to `False`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. 
+ """ + + do_split_image: bool + do_pad: bool + ignore_index: Optional[int] = None + + +def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]: + """Returns the height and width from a size dict.""" + target_height = size_dict["shortest_edge"] + target_width = size_dict["longest_edge"] or target_height + + return target_height, target_width + + +def reorder_patches_and_offsets( + patches: list[torch.Tensor], offsets: list[list[int]] +) -> tuple[list[torch.Tensor], list[list[int]]]: + """Sorts patches and offsets according to the original image index.""" + + combined = list(zip(offsets, patches)) + combined.sort(key=lambda x: x[0][0]) + sorted_offsets, sorted_patches = zip(*combined) + + return list(sorted_patches), list(sorted_offsets) + + +@auto_docstring +class EomtImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_DEFAULT_MEAN + image_std = IMAGENET_DEFAULT_STD + size = {"shortest_edge": 640, "longest_edge": 640} + default_to_square = False + do_resize = True + do_rescale = True + do_normalize = True + do_split_image = False + do_pad = False + ignore_index = None + valid_kwargs = EomtImageProcessorFastKwargs + + def __init__(self, **kwargs: Unpack[EomtImageProcessorFastKwargs]): + super().__init__(**kwargs) + + def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) -> tuple[list, list]: + """Slices an image into overlapping patches for semantic segmentation.""" + + patches, patch_offsets = [], [] + + _, _, height, width = images.shape + patch_size = size["shortest_edge"] + + longer_side = max(height, width) + num_patches = math.ceil(longer_side / patch_size) + total_overlap = num_patches * patch_size - longer_side + overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0 + + for i in range(num_patches): + start = int(i * (patch_size - overlap_per_patch)) + end = start + patch_size + + if height > width: + batch_patch = images[:, :, start:end, :] + else: + batch_patch = images[:, :, :, start:end] + + for batch_idx, single in enumerate(torch.unbind(batch_patch, dim=0)): + patches.append(single) + patch_offsets.append([image_indices[batch_idx], start, end]) + + return patches, patch_offsets + + def _pad(self, images: torch.Tensor, size: dict) -> torch.Tensor: + """Pads the image to the target size using zero padding.""" + _, _, height, width = images.shape + + target_height, target_width = get_target_size(size) + pad_h = max(0, target_height - height) + pad_w = max(0, target_width - width) + padding = (0, pad_w, 0, pad_h) + + padded_images = torch.nn.functional.pad(images, padding, mode="constant", value=0.0) + return padded_images + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + do_split_image: bool, + do_pad: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ): + """Preprocesses the input images and masks if provided.""" + processed_images, patch_offsets = [], [] + + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, 
interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for batched resizing, Needed in case do_resize is False. + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + processed_images_grouped = {} + + for shape, stacked_images in grouped_images.items(): + original_indices = [ + original_idx for original_idx, (img_shape, _) in grouped_images_index.items() if img_shape == shape + ] + + if do_split_image: + patches, offsets = self._split_image(stacked_images, size, original_indices) + processed_images.extend(patches) + patch_offsets.extend(offsets) + + if do_pad: + stacked_images = self._pad(stacked_images, size) + processed_images_grouped[shape] = stacked_images + + if do_split_image: + images, patch_offsets = reorder_patches_and_offsets(processed_images, patch_offsets) + + if do_pad: + images = reorder_images(processed_images_grouped, grouped_images_index) + + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + processed_images_grouped = {} + + for shape, stacked_images in grouped_images.items(): + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + images = reorder_images(processed_images_grouped, grouped_images_index) + + processed_images = torch.stack(images, dim=0) if return_tensors else images + + return processed_images, patch_offsets + + def _preprocess_images(self, images, **kwargs): + """Preprocesses the input images.""" + return self._preprocess(images, **kwargs) + + def _preprocess_masks(self, segmentation_maps: list[torch.Tensor], **kwargs): + """Preprocesses segmentation maps.""" + processed_segmentation_maps = [] + for segmentation_map in segmentation_maps: + segmentation_map = self._process_image( + segmentation_map, do_convert_rgb=False, input_data_format=ChannelDimension.FIRST + ) + + if segmentation_map.ndim == 2: + segmentation_map = segmentation_map[None, ...] + + processed_segmentation_maps.append(segmentation_map) + + kwargs["do_normalize"] = False + kwargs["do_rescale"] = False + kwargs["input_data_format"] = ChannelDimension.FIRST + + # Nearest interpolation is used for segmentation maps instead of BILINEAR. + kwargs["interpolation"] = pil_torch_interpolation_mapping[PILImageResampling.NEAREST] + + processed_segmentation_maps, _ = self._preprocess(images=processed_segmentation_maps, **kwargs) + processed_segmentation_maps = processed_segmentation_maps.squeeze(1) + processed_segmentation_maps = processed_segmentation_maps.to(torch.int64) + + return processed_segmentation_maps + + @auto_docstring + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[list[torch.Tensor]] = None, + instance_id_to_semantic_id: Optional[dict[int, int]] = None, + **kwargs: Unpack[EomtImageProcessorFastKwargs], + ) -> BatchFeature: + r""" + segmentation_maps (`ImageInput`, *optional*): + The segmentation maps to preprocess for corresponding images. + instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*): + A mapping between object instance ids and class ids. 
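When training targets are supplied, `preprocess` also converts each segmentation map into per-instance binary masks plus class labels. A hedged sketch of that call (the instance ids, class ids, and shapes below are invented for illustration):

```python
import numpy as np
from transformers.models.eomt.image_processing_eomt_fast import EomtImageProcessorFast

processor = EomtImageProcessorFast(do_pad=True)

image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)

# Segmentation map whose pixel values are *instance* ids (0 = background in this toy example).
segmentation_map = np.zeros((480, 640), dtype=np.uint8)
segmentation_map[100:200, 100:300] = 1
segmentation_map[250:400, 350:600] = 2

# Map each instance id to its semantic class id (made-up class ids).
instance_id_to_semantic_id = {0: 0, 1: 5, 2: 7}

inputs = processor(
    images=image,
    segmentation_maps=segmentation_map,
    instance_id_to_semantic_id=instance_id_to_semantic_id,
    return_tensors="pt",
)

# One binary mask per instance and its class label, ready to be used as training targets.
print(inputs["pixel_values"].shape)
print([m.shape for m in inputs["mask_labels"]], inputs["class_labels"])
```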
+ """ + # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names) + # Set default kwargs from self. This ensures that if a kwarg is not provided + # by the user, it gets its default value from the instance, or is set to None. + for kwarg_name in self._valid_kwargs_names: + kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None)) + + # Extract parameters that are only used for preparing the input images + do_convert_rgb = kwargs.pop("do_convert_rgb") + input_data_format = kwargs.pop("input_data_format") + device = kwargs.pop("device") + # Prepare input images + images = self._prepare_input_images( + images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device + ) + # Prepare segmentation maps + if segmentation_maps is not None: + segmentation_maps = make_list_of_images(images=segmentation_maps, expected_ndims=2) + + # Update kwargs that need further processing before being validated + kwargs = self._further_process_kwargs(**kwargs) + + # Validate kwargs + self._validate_preprocess_kwargs(**kwargs) + + # torch resize uses interpolation instead of resample + resample = kwargs.pop("resample") + + # Check if resample is an int before checking if it's an instance of PILImageResampling + # because if pillow < 9.1.0, resample is an int and PILImageResampling is a module. + # Checking PILImageResampling will fail with error `TypeError: isinstance() arg 2 must be a type or tuple of types`. + kwargs["interpolation"] = ( + pil_torch_interpolation_mapping[resample] if isinstance(resample, (int, PILImageResampling)) else resample + ) + + # Pop kwargs that are not needed in _preprocess + kwargs.pop("default_to_square") + kwargs.pop("data_format") + + ignore_index = kwargs.pop("ignore_index", None) + + processed_images, patch_offsets = self._preprocess_images(images=images, **kwargs) + + outputs = BatchFeature({"pixel_values": processed_images}) + + mask_labels, class_labels = [], [] + if segmentation_maps is not None: + segmentation_maps = self._preprocess_masks(segmentation_maps=segmentation_maps, **kwargs) + # Convert to list of binary masks and labels + for idx, segmentation_map in enumerate(segmentation_maps): + if isinstance(instance_id_to_semantic_id, list): + instance_id = instance_id_to_semantic_id[idx] + else: + instance_id = instance_id_to_semantic_id + # Use instance2class_id mapping per image + masks, classes = convert_segmentation_map_to_binary_masks( + segmentation_map, + instance_id, + ignore_index=ignore_index, + ) + + mask_labels.append(torch.from_numpy(masks)) + class_labels.append(torch.from_numpy(classes)) + + # we cannot batch them since they don't share a common class size + outputs["mask_labels"] = mask_labels + outputs["class_labels"] = class_labels + + if patch_offsets: + outputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets] + + return outputs + + def merge_image_patches( + self, + segmentation_logits: torch.Tensor, + patch_offsets: list[tuple[int, int, int]], + target_sizes: list[tuple[int, int]], + size: dict[str, int], + ) -> list[torch.Tensor]: + """ + Reconstructs full-size semantic segmentation logits from patch predictions. + + Args: + segmentation_logits (`torch.Tensor`): + A tensor of shape `(num_patches, num_classes, patch_height, patch_width)` representing predicted logits + for each image patch. 
+ patch_offsets (`list[tuple[int, int, int]]`): + A list of tuples where each tuple contains: + - `image_index` (int): Index of the original image this patch belongs to. + - `start` (int): Start pixel index of the patch along the long dimension (height or width). + - `end` (int): End pixel index of the patch along the long dimension. + target_sizes (`list[tuple[int, int]]`): + list of original (height, width) dimensions for each image before preprocessing. + size (`dict[str, int]`): + A size dict which was used to resize. + """ + num_classes = segmentation_logits.shape[1] + aggregated_logits = [] + patch_counts = [] + + for image_size in target_sizes: + height, width = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"]) + aggregated_logits.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device)) + patch_counts.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device)) + + # Stitch patches back into full-sized logit maps + for patch_idx, (image_idx, patch_start, patch_end) in enumerate(patch_offsets): + if target_sizes[image_idx][0] > target_sizes[image_idx][1]: + aggregated_logits[image_idx][:, patch_start:patch_end, :] += segmentation_logits[patch_idx] + patch_counts[image_idx][:, patch_start:patch_end, :] += 1 + else: + aggregated_logits[image_idx][:, :, patch_start:patch_end] += segmentation_logits[patch_idx] + patch_counts[image_idx][:, :, patch_start:patch_end] += 1 + + # Normalize and resize logits to original image size + reconstructed_logits = [] + for idx, (logit_sum, count) in enumerate(zip(aggregated_logits, patch_counts)): + averaged_logits = logit_sum / count.clamp(min=1) + resized_logits = torch.nn.functional.interpolate( + averaged_logits[None, ...], + size=target_sizes[idx], + mode="bilinear", + align_corners=False, + )[0] + + reconstructed_logits.append(resized_logits) + + return reconstructed_logits + + def unpad_image( + self, + segmentation_logits: torch.Tensor, + target_sizes: list[tuple[int, int]], + size: dict[str, int], + ) -> list[torch.Tensor]: + """Restores panoptic segmentation logits to their original image resolutions.""" + + resized_logits = [] + + for idx, original_size in enumerate(target_sizes): + target_height, target_width = get_size_with_aspect_ratio( + original_size, size["shortest_edge"], size["longest_edge"] + ) + cropped_logits = segmentation_logits[idx][:, :target_height, :target_width] + upsampled_logits = torch.nn.functional.interpolate( + cropped_logits[None, ...], size=original_size, mode="bilinear", align_corners=False + )[0] + resized_logits.append(upsampled_logits) + return resized_logits + + def post_process_semantic_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + size: Optional[dict[str, int]] = None, + ) -> np.ndarray: + """Post-processes model outputs into final semantic segmentation prediction.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width] + class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1] + patch_offsets = outputs.patch_offsets + + output_size = get_target_size(size) + masks_queries_logits = torch.nn.functional.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + # Remove the null class `[..., :-1]` + masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1] + masks_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, 
width] + + segmentation_logits = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs) + + output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, target_sizes, size) + + preds = [logit.argmax(dim=0) for logit in output_logits] + return preds + + def post_process_panoptic_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + threshold: float = 0.8, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + stuff_classes: Optional[list[int]] = None, + size: Optional[dict[str, int]] = None, + ): + """Post-processes model outputs into final panoptic segmentation prediction.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width] + class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + output_size = get_target_size(size) + masks_queries_logits = torch.nn.functional.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size) + pred_scores_batch, pred_labels_batch = class_queries_logits.softmax(dim=-1).max(-1) + + results: list = [] + + for i in range(batch_size): + mask_probs, pred_scores, pred_labels = remove_low_and_no_objects( + mask_probs_batch[i], pred_scores_batch[i], pred_labels_batch[i], threshold, num_labels + ) + + # No mask found + if mask_probs.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + segmentation, segments = compute_segments( + mask_probs=mask_probs, + pred_scores=pred_scores, + pred_labels=pred_labels, + stuff_classes=stuff_classes, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + target_size=target_sizes[i] if target_sizes is not None else None, + ) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + @filter_out_non_signature_kwargs() + def post_process_instance_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + threshold: float = 0.8, + size: Optional[dict[str, int]] = None, + ): + """Post-processes model outputs into Instance Segmentation Predictions.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits + class_queries_logits = outputs.class_queries_logits + + output_size = get_target_size(size) + masks_queries_logits = torch.nn.functional.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size) + + device = masks_queries_logits.device + batch_size = class_queries_logits.shape[0] + num_queries = class_queries_logits.shape[-2] + + results = [] + + for i in range(batch_size): + mask_pred = mask_probs_batch[i] + mask_class = class_queries_logits[i] + + # Remove the null class `[..., :-1]` + scores, pred_classes = mask_class.softmax(dim=-1)[..., :-1].max(-1) + pred_masks = (mask_pred > 0).float() + + # Calculate average mask prob + mask_scores = (mask_pred.sigmoid().flatten(1) * pred_masks.flatten(1)).sum(1) / ( + pred_masks.flatten(1).sum(1) + 1e-6 + ) + pred_scores = scores * mask_scores + + segmentation = 
torch.zeros(target_sizes[i], device=device) - 1 + + instance_maps, segments = [], [] + current_segment_id = 0 + for j in range(num_queries): + score = pred_scores[j].item() + + if not torch.all(pred_masks[j] == 0) and score >= threshold: + segmentation[pred_masks[j] == 1] = current_segment_id + segments.append( + { + "id": current_segment_id, + "label_id": pred_classes[j].item(), + "score": round(score, 6), + } + ) + current_segment_id += 1 + instance_maps.append(pred_masks[j]) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + +__all__ = ["EomtImageProcessorFast"] diff --git a/transformers/src/transformers/models/eomt/modeling_eomt.py b/transformers/src/transformers/models/eomt/modeling_eomt.py new file mode 100644 index 0000000000000000000000000000000000000000..025d28350fffaf3ba4c2a82922057b60e3ef9b35 --- /dev/null +++ b/transformers/src/transformers/models/eomt/modeling_eomt.py @@ -0,0 +1,1250 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/eomt/modular_eomt.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_eomt.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections.abc +import math +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...file_utils import ModelOutput, is_scipy_available, requires_backends +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...utils import auto_docstring, can_return_tuple, is_accelerate_available +from .configuration_eomt import EomtConfig + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_accelerate_available(): + from accelerate import PartialState + from accelerate.utils import reduce + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`EomtForUniversalSegmentationOutput`]. + + This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or + [`~EomtImageProcessor.post_process_instance_segmentation`] or + [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see + [`~EomtImageProcessor] for details regarding usage. + """ +) +class EomtForUniversalSegmentationOutput(ModelOutput): + r""" + loss (`torch.Tensor`, *optional*): + The computed loss, returned when labels are present. 
+ class_queries_logits (`torch.FloatTensor`): + A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each + query. Note the `+ 1` is needed because we incorporate the null class. + masks_queries_logits (`torch.FloatTensor`): + A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each + query. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Last hidden states (final feature map) of the last layer. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model. + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Self and Cross Attentions weights from transformer decoder. + patch_offsets (`list[torch.Tensor]`, *optional*): + list of tuples indicating the image index and start and end positions of patches for semantic segementation. + """ + + loss: Optional[torch.FloatTensor] = None + class_queries_logits: Optional[torch.FloatTensor] = None + masks_queries_logits: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + patch_offsets: Optional[list[torch.Tensor]] = None + + +# Adapted from https://github.com/facebookresearch/detectron2/blob/main/projects/PointRend/point_rend/point_features.py +def sample_point( + input_features: torch.Tensor, point_coordinates: torch.Tensor, add_dim=False, **kwargs +) -> torch.Tensor: + """ + A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors. + + Args: + input_features (`torch.Tensor` of shape (batch_size, channels, height, width)): + A tensor that contains features map on a height * width grid + point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,: + 2)): + A tensor that contains [0, 1] * [0, 1] normalized point coordinates + add_dim (`bool`): + boolean value to keep track of added dimension + + Returns: + point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels, + height_grid, width_grid): + A tensor that contains features for points in `point_coordinates`. + """ + if point_coordinates.dim() == 3: + add_dim = True + point_coordinates = point_coordinates.unsqueeze(2) + + # use nn.function.grid_sample to get features for points in `point_coordinates` via bilinear interpolation + point_features = torch.nn.functional.grid_sample(input_features, 2.0 * point_coordinates - 1.0, **kwargs) + if add_dim: + point_features = point_features.squeeze(3) + + return point_features + + +def pair_wise_dice_loss(inputs: Tensor, labels: Tensor) -> Tensor: + """ + A pair wise version of the dice loss, see `dice_loss` for usage. + + Args: + inputs (`torch.Tensor`): + A tensor representing a mask + labels (`torch.Tensor`): + A tensor with the same shape as inputs. 
Stores the binary classification labels for each element in inputs + (0 for the negative class and 1 for the positive class). + + Returns: + `torch.Tensor`: The computed loss between each pairs. + """ + inputs = inputs.sigmoid().flatten(1) + numerator = 2 * torch.matmul(inputs, labels.T) + # using broadcasting to get a [num_queries, NUM_CLASSES] matrix + denominator = inputs.sum(-1)[:, None] + labels.sum(-1)[None, :] + loss = 1 - (numerator + 1) / (denominator + 1) + return loss + + +def pair_wise_sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: + r""" + A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage. + + Args: + inputs (`torch.Tensor`): + A tensor representing a mask. + labels (`torch.Tensor`): + A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs + (0 for the negative class and 1 for the positive class). + + Returns: + loss (`torch.Tensor`): The computed loss between each pairs. + """ + + height_and_width = inputs.shape[1] + + criterion = nn.BCEWithLogitsLoss(reduction="none") + cross_entropy_loss_pos = criterion(inputs, torch.ones_like(inputs)) + cross_entropy_loss_neg = criterion(inputs, torch.zeros_like(inputs)) + + loss_pos = torch.matmul(cross_entropy_loss_pos / height_and_width, labels.T) + loss_neg = torch.matmul(cross_entropy_loss_neg / height_and_width, (1 - labels).T) + loss = loss_pos + loss_neg + return loss + + +# Adapted from https://github.com/facebookresearch/Eomt/blob/main/eomt/modeling/matcher.py +class EomtHungarianMatcher(nn.Module): + """This class computes an assignment between the labels and the predictions of the network. + + For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more + predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + """ + + def __init__( + self, cost_class: float = 1.0, cost_mask: float = 1.0, cost_dice: float = 1.0, num_points: int = 12544 + ): + """Creates the matcher + + Params: + cost_class (`float`, *optional*, defaults to 1.0): + Relative weight of the classification error in the matching cost. + cost_mask (`float`, *optional*, defaults to 1.0): + This is the relative weight of the focal loss of the binary mask in the matching cost. + cost_dice (`float`, *optional*, defaults to 1.0): + This is the relative weight of the dice loss of the binary mask in the matching cost. + num_points (`int`, *optional*, defaults to 12544): + No. of points to sample on which the mask loss will be calculated. The same set of K points are + uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite + matching. + """ + super().__init__() + if cost_class == 0 and cost_mask == 0 and cost_dice == 0: + raise ValueError("All costs can't be 0") + + self.num_points = num_points + self.cost_class = cost_class + self.cost_mask = cost_mask + self.cost_dice = cost_dice + + @torch.no_grad() + def forward( + self, + masks_queries_logits: torch.Tensor, + class_queries_logits: torch.Tensor, + mask_labels: torch.Tensor, + class_labels: torch.Tensor, + ) -> list[tuple[Tensor]]: + """ + Params: + masks_queries_logits (`torch.Tensor`): + A tensor of dim `batch_size, num_queries, num_labels` with the classification logits. + class_queries_logits (`torch.Tensor`): + A tensor of dim `batch_size, num_queries, height, width` with the predicted masks. 
+ class_labels (`torch.Tensor`): + A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the + target) containing the class labels. + mask_labels (`torch.Tensor`): + A tensor of dim `num_target_boxes, height, width` containing the target masks. + + Returns: + matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j) + where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected labels (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes). + """ + indices: list[tuple[np.array]] = [] + + # iterate through batch size + batch_size = masks_queries_logits.shape[0] + for i in range(batch_size): + pred_probs = class_queries_logits[i].softmax(-1) + pred_mask = masks_queries_logits[i] + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, but approximate it in 1 - proba[target class]. The 1 is a constant that doesn't change the matching, it can be omitted. + cost_class = -pred_probs[:, class_labels[i]] + target_mask = mask_labels[i].to(pred_mask) + target_mask = target_mask[:, None] + pred_mask = pred_mask[:, None] + + # Sample ground truth and predicted masks + point_coordinates = torch.rand(1, self.num_points, 2, device=pred_mask.device) + + target_coordinates = point_coordinates.repeat(target_mask.shape[0], 1, 1) + target_mask = sample_point(target_mask, target_coordinates, align_corners=False).squeeze(1) + + pred_coordinates = point_coordinates.repeat(pred_mask.shape[0], 1, 1) + pred_mask = sample_point(pred_mask, pred_coordinates, align_corners=False).squeeze(1) + + # compute the cross entropy loss between each mask pairs -> shape (num_queries, num_labels) + cost_mask = pair_wise_sigmoid_cross_entropy_loss(pred_mask, target_mask) + # Compute the dice loss between each mask pairs -> shape (num_queries, num_labels) + cost_dice = pair_wise_dice_loss(pred_mask, target_mask) + # final cost matrix + cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice + # eliminate infinite values in cost_matrix to avoid the error ``ValueError: cost matrix is infeasible`` + cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10)) + cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10)) + cost_matrix = torch.nan_to_num(cost_matrix, 0) + # do the assignment using the hungarian algorithm in scipy + assigned_indices: tuple[np.array] = linear_sum_assignment(cost_matrix.cpu()) + indices.append(assigned_indices) + + # It could be stacked in one tensor + matched_indices = [ + (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices + ] + return matched_indices + + +def dice_loss(inputs: Tensor, labels: Tensor, num_masks: int) -> Tensor: + r""" + Compute the DICE loss, similar to generalized IOU for masks as follows: + + $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$ + + In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow + + $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$ + + Args: + inputs (`torch.Tensor`): + A tensor representing a mask. + labels (`torch.Tensor`): + A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs + (0 for the negative class and 1 for the positive class). 
+ num_masks (`int`): + The number of masks present in the current batch, used for normalization. + + Returns: + `torch.Tensor`: The computed loss. + """ + probs = inputs.sigmoid().flatten(1) + numerator = 2 * (probs * labels).sum(-1) + denominator = probs.sum(-1) + labels.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + loss = loss.sum() / num_masks + return loss + + +def sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor, num_masks: int) -> torch.Tensor: + r""" + Args: + inputs (`torch.Tensor`): + A float tensor of arbitrary shape. + labels (`torch.Tensor`): + A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs + (0 for the negative class and 1 for the positive class). + + Returns: + loss (`torch.Tensor`): The computed loss. + """ + criterion = nn.BCEWithLogitsLoss(reduction="none") + cross_entropy_loss = criterion(inputs, labels) + + loss = cross_entropy_loss.mean(1).sum() / num_masks + return loss + + +# Adapted from https://github.com/facebookresearch/Eomt/blob/main/eomt/modeling/criterion.py +class EomtLoss(nn.Module): + def __init__(self, config: EomtConfig, weight_dict: dict[str, float]): + """ + The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we + compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair + of matched ground-truth / prediction (supervise class and mask) + + Args: + config (`EomtConfig`): + The configuration for Eomt model also containing loss calculation specific parameters. + weight_dict (`dict[str, float]`): + A dictionary of weights to be applied to the different losses. + """ + super().__init__() + requires_backends(self, ["scipy"]) + self.num_labels = config.num_labels + self.weight_dict = weight_dict + + # Weight to apply to the null class + self.eos_coef = config.no_object_weight + empty_weight = torch.ones(self.num_labels + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + + # pointwise mask loss parameters + self.num_points = config.train_num_points + self.oversample_ratio = config.oversample_ratio + self.importance_sample_ratio = config.importance_sample_ratio + + self.matcher = EomtHungarianMatcher( + cost_class=config.class_weight, + cost_dice=config.dice_weight, + cost_mask=config.mask_weight, + num_points=self.num_points, + ) + + def _max_by_axis(self, sizes: list[list[int]]) -> list[int]: + maxes = sizes[0] + for sublist in sizes[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + # Adapted from nested_tensor_from_tensor_list() in original implementation + def _pad_images_to_max_in_batch(self, tensors: list[Tensor]) -> tuple[Tensor, Tensor]: + # get the maximum size in the batch + max_size = self._max_by_axis([list(tensor.shape) for tensor in tensors]) + # compute final size + batch_shape = [len(tensors)] + max_size + batch_size, _, height, width = batch_shape + dtype = tensors[0].dtype + device = tensors[0].device + padded_tensors = torch.zeros(batch_shape, dtype=dtype, device=device) + padding_masks = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + # pad the tensors to the size of the biggest one + for tensor, padded_tensor, padding_mask in zip(tensors, padded_tensors, padding_masks): + padded_tensor[: tensor.shape[0], : tensor.shape[1], : tensor.shape[2]].copy_(tensor) + padding_mask[: tensor.shape[1], : tensor.shape[2]] = False + + return 
padded_tensors, padding_masks + + def loss_labels( + self, class_queries_logits: Tensor, class_labels: list[Tensor], indices: tuple[np.array] + ) -> dict[str, Tensor]: + """Compute the losses related to the labels using cross entropy. + + Args: + class_queries_logits (`torch.Tensor`): + A tensor of shape `batch_size, num_queries, num_labels` + class_labels (`list[torch.Tensor]`): + List of class labels of shape `(labels)`. + indices (`tuple[np.array])`: + The indices computed by the Hungarian matcher. + + Returns: + `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key: + - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels. + """ + pred_logits = class_queries_logits + batch_size, num_queries, _ = pred_logits.shape + criterion = nn.CrossEntropyLoss(weight=self.empty_weight) + idx = self._get_predictions_permutation_indices(indices) # shape of (batch_size, num_queries) + target_classes_o = torch.cat( + [target[j] for target, (_, j) in zip(class_labels, indices)] + ) # shape of (batch_size, num_queries) + target_classes = torch.full( + (batch_size, num_queries), fill_value=self.num_labels, dtype=torch.int64, device=pred_logits.device + ) + target_classes[idx] = target_classes_o + # Permute target_classes (batch_size, num_queries, num_labels) -> (batch_size, num_labels, num_queries) + pred_logits_transposed = pred_logits.transpose(1, 2) + loss_ce = criterion(pred_logits_transposed, target_classes) + losses = {"loss_cross_entropy": loss_ce} + return losses + + def loss_masks( + self, + masks_queries_logits: torch.Tensor, + mask_labels: list[torch.Tensor], + indices: tuple[np.array], + num_masks: int, + ) -> dict[str, torch.Tensor]: + """Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss. + + Args: + masks_queries_logits (`torch.Tensor`): + A tensor of shape `(batch_size, num_queries, height, width)`. + mask_labels (`torch.Tensor`): + List of mask labels of shape `(labels, height, width)`. + indices (`tuple[np.array])`: + The indices computed by the Hungarian matcher. + num_masks (`int)`: + The number of masks, used for normalization. + + Returns: + losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys: + - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth. + masks. + - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth, + masks. 
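For intuition, once predictions and targets have been reduced to logits and labels at the same sampled points, the two mask losses above collapse to a few lines. A self-contained numerical sketch mirroring `dice_loss` and `sigmoid_cross_entropy_loss` (random tensors stand in for the sampled points):

```python
import torch
import torch.nn as nn

num_masks, num_points = 4, 16
point_logits = torch.randn(num_masks, num_points)                      # predicted mask logits at sampled points
point_labels = torch.randint(0, 2, (num_masks, num_points)).float()    # ground-truth 0/1 values at the same points

# Dice loss: 1 - 2*|X ∩ Y| / (|X| + |Y|), with +1 smoothing, summed over masks and normalized by num_masks.
probs = point_logits.sigmoid()
numerator = 2 * (probs * point_labels).sum(-1)
denominator = probs.sum(-1) + point_labels.sum(-1)
loss_dice = (1 - (numerator + 1) / (denominator + 1)).sum() / num_masks

# Sigmoid cross-entropy: per-point BCE, averaged over points, summed over masks, normalized by num_masks.
loss_mask = nn.BCEWithLogitsLoss(reduction="none")(point_logits, point_labels).mean(1).sum() / num_masks

print(loss_dice.item(), loss_mask.item())
```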
+ """ + src_idx = self._get_predictions_permutation_indices(indices) + tgt_idx = self._get_targets_permutation_indices(indices) + # shape (batch_size * num_queries, height, width) + pred_masks = masks_queries_logits[src_idx] + # shape (batch_size, num_queries, height, width) + # pad all and stack the targets to the num_labels dimension + target_masks, _ = self._pad_images_to_max_in_batch(mask_labels) + target_masks = target_masks[tgt_idx] + + # No need to upsample predictions as we are using normalized coordinates + pred_masks = pred_masks[:, None] + target_masks = target_masks[:, None] + + # Sample point coordinates + with torch.no_grad(): + point_coordinates = self.sample_points_using_uncertainty( + pred_masks, + lambda logits: self.calculate_uncertainty(logits), + self.num_points, + self.oversample_ratio, + self.importance_sample_ratio, + ) + + point_labels = sample_point(target_masks, point_coordinates, align_corners=False).squeeze(1) + + point_logits = sample_point(pred_masks, point_coordinates, align_corners=False).squeeze(1) + + losses = { + "loss_mask": sigmoid_cross_entropy_loss(point_logits, point_labels, num_masks), + "loss_dice": dice_loss(point_logits, point_labels, num_masks), + } + + del pred_masks + del target_masks + return losses + + def _get_predictions_permutation_indices(self, indices): + # Permute predictions following indices + batch_indices = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + predictions_indices = torch.cat([src for (src, _) in indices]) + return batch_indices, predictions_indices + + def _get_targets_permutation_indices(self, indices): + # Permute labels following indices + batch_indices = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + target_indices = torch.cat([tgt for (_, tgt) in indices]) + return batch_indices, target_indices + + def calculate_uncertainty(self, logits: torch.Tensor) -> torch.Tensor: + """ + In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits' + for the foreground class in `classes`. + + Args: + logits (`torch.Tensor`): + A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is: + the number of foreground classes. The values are logits. + + Returns: + scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most + uncertain locations having the highest uncertainty score. + """ + uncertainty_scores = -(torch.abs(logits)) + return uncertainty_scores + + def sample_points_using_uncertainty( + self, + logits: torch.Tensor, + uncertainty_function, + num_points: int, + oversample_ratio: int, + importance_sample_ratio: float, + ) -> torch.Tensor: + """ + This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The + uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit + prediction as input. + + Args: + logits (`float`): + Logit predictions for P points. + uncertainty_function: + A function that takes logit predictions for P points and returns their uncertainties. + num_points (`int`): + The number of points P to sample. + oversample_ratio (`int`): + Oversampling parameter. + importance_sample_ratio (`float`): + Ratio of points that are sampled via importance sampling. + + Returns: + point_coordinates (`torch.Tensor`): + Coordinates for P sampled points. 
+ """ + + num_boxes = logits.shape[0] + num_points_sampled = int(num_points * oversample_ratio) + + # Get random point coordinates + point_coordinates = torch.rand(num_boxes, num_points_sampled, 2, device=logits.device) + # Get sampled prediction value for the point coordinates + point_logits = sample_point(logits, point_coordinates, align_corners=False) + # Calculate the uncertainties based on the sampled prediction values of the points + point_uncertainties = uncertainty_function(point_logits) + + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + + idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_points_sampled * torch.arange(num_boxes, dtype=torch.long, device=logits.device) + idx += shift[:, None] + point_coordinates = point_coordinates.view(-1, 2)[idx.view(-1), :].view(num_boxes, num_uncertain_points, 2) + + if num_random_points > 0: + point_coordinates = torch.cat( + [point_coordinates, torch.rand(num_boxes, num_random_points, 2, device=logits.device)], + dim=1, + ) + return point_coordinates + + def forward( + self, + masks_queries_logits: torch.Tensor, + class_queries_logits: torch.Tensor, + mask_labels: list[torch.Tensor], + class_labels: list[torch.Tensor], + auxiliary_predictions: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: + """ + This performs the loss computation. + + Args: + masks_queries_logits (`torch.Tensor`): + A tensor of shape `(batch_size, num_queries, height, width)`. + class_queries_logits (`torch.Tensor`): + A tensor of shape `(batch_size, num_queries, num_labels)`. + mask_labels (`torch.Tensor`): + List of mask labels of shape `(labels, height, width)`. + class_labels (`list[torch.Tensor]`): + List of class labels of shape `(labels)`. + auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*): + if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from + the inner layers of the EomtMaskedAttentionDecoder. + + Returns: + losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys: + - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels. + - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth + masks. + - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth + masks. + if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional + losses for each auxiliary predictions. + """ + + # retrieve the matching between the outputs of the last layer and the labels + indices = self.matcher(masks_queries_logits, class_queries_logits, mask_labels, class_labels) + # compute the average number of target masks for normalization purposes + num_masks = self.get_num_masks(class_labels, device=class_labels[0].device) + # get all the losses + losses: dict[str, Tensor] = { + **self.loss_masks(masks_queries_logits, mask_labels, indices, num_masks), + **self.loss_labels(class_queries_logits, class_labels, indices), + } + # in case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
+ if auxiliary_predictions is not None: + for idx, aux_outputs in enumerate(auxiliary_predictions): + masks_queries_logits = aux_outputs["masks_queries_logits"] + class_queries_logits = aux_outputs["class_queries_logits"] + loss_dict = self.forward(masks_queries_logits, class_queries_logits, mask_labels, class_labels) + loss_dict = {f"{key}_{idx}": value for key, value in loss_dict.items()} + losses.update(loss_dict) + + return losses + + def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> torch.Tensor: + """ + Computes the average number of target masks across the batch, for normalization purposes. + """ + num_masks = sum([len(classes) for classes in class_labels]) + num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device) + world_size = 1 + if is_accelerate_available(): + if PartialState._shared_state != {}: + num_masks = reduce(num_masks) + world_size = PartialState().num_processes + + num_masks = torch.clamp(num_masks / world_size, min=1) + return num_masks + + +class EomtPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class EomtEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, position and patch embeddings. 
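The sequence fed to the encoder is `[CLS] + register tokens + patch tokens`. As a quick size check (the 640 px image, 16 px patch and 4 register tokens below are illustrative assumptions, not values read from any particular checkpoint):

```python
image_size, patch_size, num_register_tokens = 640, 16, 4  # illustrative values only

num_patches = (image_size // patch_size) ** 2        # 40 * 40 = 1600 patch tokens
num_prefix_tokens = 1 + num_register_tokens          # [CLS] + register tokens = 5
seq_length = num_prefix_tokens + num_patches         # 1605 tokens enter the encoder

print(num_patches, num_prefix_tokens, seq_length)
```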
+ """ + + def __init__(self, config: EomtConfig) -> None: + super().__init__() + + self.config = config + self.patch_size = config.patch_size + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + + self.patch_embeddings = EomtPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.num_prefix_tokens = 1 + config.num_register_tokens # 1 for [CLS] + self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, _, _ = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + register_tokens = self.register_tokens.expand(batch_size, -1, -1) + + embeddings = embeddings + self.position_embeddings(self.position_ids) + embeddings = torch.cat([cls_tokens, register_tokens, embeddings], dim=1) + + embeddings = self.dropout(embeddings) + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class EomtAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + self.is_causal = False + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, seq_length, embed_dim = hidden_states.shape + + queries = self.q_proj(hidden_states) + keys = self.k_proj(hidden_states) + values = self.v_proj(hidden_states) + + queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + queries, + keys, + values, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, + ) + + attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class EomtLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
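The `1 / keep_prob` rescaling keeps the expected activation unchanged: kept samples are scaled up, dropped samples contribute zero. A quick empirical sanity check of that expectation:

```python
import torch

drop_prob = 0.2
keep_prob = 1 - drop_prob
x = torch.ones(100_000, 1, 1)  # many "samples" along the batch dimension, value 1.0 each

# 1 with probability keep_prob, 0 otherwise (same binarization trick as drop_path).
mask = (keep_prob + torch.rand(x.shape[0], 1, 1)).floor()
out = x / keep_prob * mask

# The mean over samples stays ~1.0 even though roughly 20% of the paths were dropped.
print(out.mean().item())
```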
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class EomtDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class EomtMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class EomtSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class EomtLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: EomtConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = EomtAttention(config) + self.layer_scale1 = EomtLayerScale(config) + self.drop_path = EomtDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = EomtSwiGLUFFN(config) + else: + self.mlp = EomtMLP(config) + self.layer_scale2 = EomtLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Eomt, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in Eomt, 
layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class EomtLayerNorm2d(nn.LayerNorm): + def __init__(self, num_channels, eps=1e-6, affine=True): + super().__init__(num_channels, eps=eps, elementwise_affine=affine) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = hidden_state.permute(0, 2, 3, 1) + hidden_state = F.layer_norm(hidden_state, self.normalized_shape, self.weight, self.bias, self.eps) + hidden_state = hidden_state.permute(0, 3, 1, 2) + return hidden_state + + +class EomtScaleLayer(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + hidden_size = config.hidden_size + self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2) + self.activation = ACT2FN[config.hidden_act] + self.conv2 = nn.Conv2d( + hidden_size, + hidden_size, + kernel_size=3, + padding=1, + groups=hidden_size, + bias=False, + ) + + self.layernorm2d = EomtLayerNorm2d(hidden_size) + + def forward(self, hidden_states: torch.tensor) -> torch.Tensor: + hidden_states = self.conv1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.layernorm2d(hidden_states) + return hidden_states + + +class EomtScaleBlock(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + self.num_blocks = config.num_upscale_blocks + self.block = nn.ModuleList([EomtScaleLayer(config) for _ in range(self.num_blocks)]) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for block in self.block: + hidden_states = block(hidden_states) + return hidden_states + + +class EomtMaskHead(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + + hidden_size = config.hidden_size + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, hidden_size) + self.fc3 = nn.Linear(hidden_size, hidden_size) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.activation(self.fc1(hidden_states)) + hidden_states = self.activation(self.fc2(hidden_states)) + hidden_states = self.fc3(hidden_states) + return hidden_states + + +@auto_docstring +class EomtPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
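For orientation, a typical inference round trip with the classes introduced in this diff looks like the sketch below; the checkpoint id and image filename are placeholders, not real artifacts:

```python
import torch
from PIL import Image
from transformers.models.eomt.image_processing_eomt_fast import EomtImageProcessorFast
from transformers.models.eomt.modeling_eomt import EomtForUniversalSegmentation

checkpoint = "org/eomt-panoptic-checkpoint"  # placeholder repository id

processor = EomtImageProcessorFast.from_pretrained(checkpoint)
model = EomtForUniversalSegmentation.from_pretrained(checkpoint)

image = Image.open("scene.jpg").convert("RGB")  # placeholder file
inputs = processor(images=image, do_pad=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# Turn query logits into a full-resolution panoptic map at the original image size.
panoptic = processor.post_process_panoptic_segmentation(
    outputs, target_sizes=[image.size[::-1]]  # PIL size is (width, height), so reverse to (height, width)
)[0]
print(panoptic["segmentation"].shape, len(panoptic["segments_info"]))
```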
+ """ + + config_class = EomtConfig + base_model_prefix = "eomt" + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + _no_split_modules = ["EomtLayer"] + _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_flash_attn_3 = True + + def _init_weights(self, module: nn.Module) -> None: + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(module.bias, -bound, bound) + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=1) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, EomtLayerScale): + if hasattr(module, "lambda1"): + module.lambda1.data.fill_(self.config.layerscale_value) + elif isinstance(module, EomtEmbeddings): + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), mean=0.0, std=std + ).to(module.cls_token.dtype) + module.register_tokens.data.zero_() + + +@auto_docstring( + custom_intro=""" + The EoMT Model with head on top for instance/semantic/panoptic segmentation. + """ +) +class EomtForUniversalSegmentation(EomtPreTrainedModel): + main_input_name = "pixel_values" + + def __init__(self, config: EomtConfig) -> None: + super().__init__(config) + self.config = config + self.num_hidden_layers = config.num_hidden_layers + self.embeddings = EomtEmbeddings(config) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.query = nn.Embedding(config.num_queries, config.hidden_size) + self.layers = nn.ModuleList([EomtLayer(config) for _ in range(config.num_hidden_layers)]) + + self.upscale_block = EomtScaleBlock(config) + self.mask_head = EomtMaskHead(config) + + self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1) + + self.grid_size = (config.image_size // config.patch_size, config.image_size // config.patch_size) + self.weight_dict: dict[str, float] = { + "loss_cross_entropy": config.class_weight, + "loss_mask": config.mask_weight, + "loss_dice": config.dice_weight, + } + + self.criterion = EomtLoss(config=config, weight_dict=self.weight_dict) + + self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks)) + + self.post_init() + + def get_loss_dict( + self, + masks_queries_logits: Tensor, + class_queries_logits: Tensor, + mask_labels: Tensor, + class_labels: Tensor, + auxiliary_predictions: dict[str, Tensor], + ) -> dict[str, Tensor]: + loss_dict: dict[str, Tensor] = self.criterion( + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + mask_labels=mask_labels, + class_labels=class_labels, + auxiliary_predictions=auxiliary_predictions, + ) + + # weight each loss by `self.weight_dict[]` including auxiliary losses + for key, weight in self.weight_dict.items(): + for loss_key, loss in loss_dict.items(): + if key in loss_key: + loss *= weight + + return loss_dict + + def get_loss(self, loss_dict: dict[str, Tensor]) -> Tensor: + return sum(loss_dict.values()) + + @auto_docstring + @can_return_tuple + def forward( + self, + pixel_values: Tensor, + mask_labels: Optional[list[Tensor]] = None, + class_labels: Optional[list[Tensor]] = None, + 
output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + patch_offsets: Optional[list[Tensor]] = None, + ) -> EomtForUniversalSegmentationOutput: + r""" + mask_labels (`list[torch.Tensor]`, *optional*): + list of mask labels of shape `(num_labels, height, width)` to be fed to a model. + class_labels (`list[torch.LongTensor]`, *optional*): + list of target class labels of shape `(num_labels,)` to be fed to a model. They identify the + labels of `mask_labels`, i.e. the label of `mask_labels[i][j]` is `class_labels[i][j]`. + patch_offsets (`list[torch.Tensor]`, *optional*): + list of tuples indicating the image index and start and end positions of patches for semantic segmentation. + """ + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + masks_queries_logits_per_layer, class_queries_logits_per_layer = (), () + attention_mask = None + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + for idx, layer_module in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if idx == self.num_hidden_layers - self.config.num_blocks: + query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device) + hidden_states = torch.cat((query, hidden_states), dim=1) + + if idx >= self.num_hidden_layers - self.config.num_blocks and ( + self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0 + ): + norm_hidden_states = self.layernorm(hidden_states) + masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states) + + masks_queries_logits_per_layer += (masks_queries_logits,) + class_queries_logits_per_layer += (class_queries_logits,) + + attention_mask = torch.ones( + hidden_states.shape[0], + hidden_states.shape[1], + hidden_states.shape[1], + device=hidden_states.device, + dtype=torch.bool, + ) + + interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear") + interpolated_logits = interpolated_logits.view( + interpolated_logits.size(0), interpolated_logits.size(1), -1 + ) + + num_query_tokens = self.config.num_queries + encoder_start_tokens = num_query_tokens + self.embeddings.num_prefix_tokens + + # Set attention mask for queries to focus on encoder tokens based on interpolated logits + attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0 + + # Disable attention mask for random query tokens. + attention_mask = self._disable_attention_mask( + attention_mask, + prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks], + num_query_tokens=num_query_tokens, + encoder_start_tokens=encoder_start_tokens, + device=attention_mask.device, + ) + + # Expand attention mask to 4d mask.
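+                # Broadcasting copies the (batch, seq_len, seq_len) boolean mask across all
+                # attention heads, giving the 4D (batch, num_heads, seq_len, seq_len) layout the
+                # attention layer expects. The masked_fill below then turns it into an additive
+                # mask: blocked positions become -1e9 (effectively -inf after softmax), while
+                # kept positions share a constant offset that the row-wise softmax cancels out.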
+ attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1) + attention_mask = attention_mask.float().masked_fill(~attention_mask, -1e9) + + layer_outputs = layer_module(hidden_states, attention_mask, output_attentions) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + sequence_output = self.layernorm(hidden_states) + if output_hidden_states: + all_hidden_states += (sequence_output,) + + masks_queries_logits, class_queries_logits = self.predict(sequence_output) + masks_queries_logits_per_layer += (masks_queries_logits,) + class_queries_logits_per_layer += (class_queries_logits,) + + loss = None + if mask_labels is not None and class_labels is not None: + loss = 0.0 + for masks_queries_logits, class_queries_logits in zip( + masks_queries_logits_per_layer, class_queries_logits_per_layer + ): + loss_dict = self.get_loss_dict( + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + mask_labels=mask_labels, + class_labels=class_labels, + auxiliary_predictions=None, + ) + loss += self.get_loss(loss_dict) + + return EomtForUniversalSegmentationOutput( + loss=loss, + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + last_hidden_state=sequence_output, + hidden_states=all_hidden_states, + attentions=all_attentions, + patch_offsets=patch_offsets, + ) + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def predict(self, logits: torch.Tensor): + query_tokens = logits[:, : self.config.num_queries, :] + class_logits = self.class_predictor(query_tokens) + + prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :] + prefix_tokens = prefix_tokens.transpose(1, 2) + + prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size) + + query_tokens = self.mask_head(query_tokens) + prefix_tokens = self.upscale_block(prefix_tokens) + + mask_logits = torch.einsum("bqc, bchw -> bqhw", query_tokens, prefix_tokens) + + return mask_logits, class_logits + + @staticmethod + def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device): + if prob < 1: + # Generate random queries to disable based on the probs + random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob + + # Disable attention to the query tokens, considering the prefix tokens + attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = 1 + + return attn_mask + + +__all__ = ["EomtPreTrainedModel", "EomtForUniversalSegmentation"] diff --git a/transformers/src/transformers/models/eomt/modular_eomt.py b/transformers/src/transformers/models/eomt/modular_eomt.py new file mode 100644 index 0000000000000000000000000000000000000000..aeb562fc26b4751d8f7e1eed5ec584dada929084 --- /dev/null +++ b/transformers/src/transformers/models/eomt/modular_eomt.py @@ -0,0 +1,596 @@ +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch EoMT model.""" + +import math +from dataclasses import dataclass +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + auto_docstring, + can_return_tuple, + logging, +) +from ..dinov2.modeling_dinov2 import ( + Dinov2Embeddings, + Dinov2Layer, + Dinov2LayerScale, + Dinov2PatchEmbeddings, +) +from ..mask2former.modeling_mask2former import Mask2FormerForUniversalSegmentation, Mask2FormerLoss +from ..siglip.modeling_siglip import SiglipAttention +from ..vit.configuration_vit import ViTConfig + + +logger = logging.get_logger(__name__) + + +class EomtConfig(ViTConfig): + r""" + This is the configuration class to store the configuration of a [`EomtForUniversalSegmentation`]. It is used to instantiate an EoMT model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the EoMT + [tue-mps/coco_panoptic_eomt_large_640](https://huggingface.co/tue-mps/coco_panoptic_eomt_large_640) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads in each attention layer. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the MLP hidden dimensionality to the hidden size. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 640): + The size (resolution) of each input image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value for the LayerScale parameter. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The stochastic depth rate (drop path) used during training. + num_upscale_blocks (`int`, *optional*, defaults to 2): + Number of upsampling blocks used in the decoder or segmentation head. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability applied after attention projection. + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_blocks (`int`, *optional*, defaults to 4): + Number of feature blocks or stages in the architecture. 
+ no_object_weight (`float`, *optional*, defaults to 0.1): + Loss weight for the 'no object' class in panoptic/instance segmentation. + class_weight (`float`, *optional*, defaults to 2.0): + Loss weight for classification targets. + mask_weight (`float`, *optional*, defaults to 5.0): + Loss weight for mask prediction. + dice_weight (`float`, *optional*, defaults to 5.0): + Loss weight for the dice loss component. + train_num_points (`int`, *optional*, defaults to 12544): + Number of points to sample for mask loss computation during training. + oversample_ratio (`float`, *optional*, defaults to 3.0): + Oversampling ratio used in point sampling for mask training. + importance_sample_ratio (`float`, *optional*, defaults to 0.75): + Ratio of points to sample based on importance during training. + num_queries (`int`, *optional*, defaults to 200): + Number of object queries in the Transformer. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of learnable register tokens added to the transformer input. + + Example: + + ```python + >>> from transformers import EomtConfig, EomtForUniversalSegmentation + + >>> # Initialize configuration + >>> config = EomtConfig() + + >>> # Initialize model + >>> model = EomtForUniversalSegmentation(config) + + >>> # Access config + >>> config = model.config + ```""" + + model_type = "eomt" + + def __init__( + self, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=640, + patch_size=16, + num_channels=3, + layerscale_value=1.0, + drop_path_rate=0.0, + num_upscale_blocks=2, + attention_dropout=0.0, + use_swiglu_ffn=False, + num_blocks=4, + no_object_weight: float = 0.1, + class_weight: float = 2.0, + mask_weight: float = 5.0, + dice_weight: float = 5.0, + train_num_points: int = 12544, + oversample_ratio: float = 3.0, + importance_sample_ratio: float = 0.75, + num_queries=200, + num_register_tokens=4, + **kwargs, + ): + super().__init__( + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_dropout_prob=hidden_dropout_prob, + hidden_act=hidden_act, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + image_size=image_size, + patch_size=patch_size, + num_channels=num_channels, + **kwargs, + ) + + del self.intermediate_size + del self.qkv_bias + del self.pooler_act + del self.pooler_output_size + del self.encoder_stride + del self.attention_probs_dropout_prob + + self.mlp_ratio = mlp_ratio + self.attention_dropout = attention_dropout + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.num_upscale_blocks = num_upscale_blocks + self.use_swiglu_ffn = use_swiglu_ffn + self.num_blocks = num_blocks + self.no_object_weight = no_object_weight + self.class_weight = class_weight + self.mask_weight = mask_weight + self.dice_weight = dice_weight + self.train_num_points = train_num_points + self.oversample_ratio = oversample_ratio + self.importance_sample_ratio = importance_sample_ratio + self.num_queries = num_queries + self.num_register_tokens = num_register_tokens + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`EomtForUniversalSegmentation`].
+ + This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or + [`~EomtImageProcessor.post_process_instance_segmentation`] or + [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please see + [`~EomtImageProcessor`] for details regarding usage. + """ +) +class EomtForUniversalSegmentationOutput(ModelOutput): + r""" + loss (`torch.Tensor`, *optional*): + The computed loss, returned when labels are present. + class_queries_logits (`torch.FloatTensor`): + A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each + query. Note the `+ 1` is needed because we incorporate the null class. + masks_queries_logits (`torch.FloatTensor`): + A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each + query. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Last hidden states (final token representations) of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden states of the model at the output of each layer. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Self-attention weights from the transformer layers, after the attention softmax. + patch_offsets (`list[torch.Tensor]`, *optional*): + list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
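+
+        Example (an illustrative sketch; it assumes the checkpoint named in the config docs above and a preloaded PIL `image`):
+
+        ```python
+        >>> from transformers import AutoImageProcessor, EomtForUniversalSegmentation
+
+        >>> processor = AutoImageProcessor.from_pretrained("tue-mps/coco_panoptic_eomt_large_640")
+        >>> model = EomtForUniversalSegmentation.from_pretrained("tue-mps/coco_panoptic_eomt_large_640")
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> # `outputs` can then be handed to e.g. `processor.post_process_panoptic_segmentation`
+        ```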
+ """ + + loss: Optional[torch.FloatTensor] = None + class_queries_logits: Optional[torch.FloatTensor] = None + masks_queries_logits: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + patch_offsets: Optional[list[torch.Tensor]] = None + + +class EomtLoss(Mask2FormerLoss): + pass + + +class EomtPatchEmbeddings(Dinov2PatchEmbeddings): + pass + + +class EomtEmbeddings(Dinov2Embeddings, nn.Module): + def __init__(self, config: EomtConfig) -> None: + Dinov2Embeddings().__init__() + + self.config = config + self.patch_size = config.patch_size + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + + self.patch_embeddings = EomtPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.num_prefix_tokens = 1 + config.num_register_tokens # 1 for [CLS] + self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self): + raise AttributeError("Not needed for Eomt Model") + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, _, _ = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + register_tokens = self.register_tokens.expand(batch_size, -1, -1) + + embeddings = embeddings + self.position_embeddings(self.position_ids) + embeddings = torch.cat([cls_tokens, register_tokens, embeddings], dim=1) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class EomtAttention(SiglipAttention): + pass + + +class EomtLayerScale(Dinov2LayerScale): + pass + + +class EomtLayer(Dinov2Layer): + pass + + +class EomtLayerNorm2d(nn.LayerNorm): + def __init__(self, num_channels, eps=1e-6, affine=True): + super().__init__(num_channels, eps=eps, elementwise_affine=affine) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = hidden_state.permute(0, 2, 3, 1) + hidden_state = F.layer_norm(hidden_state, self.normalized_shape, self.weight, self.bias, self.eps) + hidden_state = hidden_state.permute(0, 3, 1, 2) + return hidden_state + + +class EomtScaleLayer(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + hidden_size = config.hidden_size + self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2) + self.activation = ACT2FN[config.hidden_act] + self.conv2 = nn.Conv2d( + hidden_size, + hidden_size, + kernel_size=3, + padding=1, + groups=hidden_size, + bias=False, + ) + + self.layernorm2d = EomtLayerNorm2d(hidden_size) + + def forward(self, hidden_states: torch.tensor) -> torch.Tensor: + hidden_states = self.conv1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.layernorm2d(hidden_states) + return hidden_states + + +class EomtScaleBlock(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + self.num_blocks = config.num_upscale_blocks + self.block = nn.ModuleList([EomtScaleLayer(config) for _ in range(self.num_blocks)]) + + def forward(self, 
hidden_states: torch.Tensor) -> torch.Tensor: + for block in self.block: + hidden_states = block(hidden_states) + return hidden_states + + +class EomtMaskHead(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + + hidden_size = config.hidden_size + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, hidden_size) + self.fc3 = nn.Linear(hidden_size, hidden_size) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.activation(self.fc1(hidden_states)) + hidden_states = self.activation(self.fc2(hidden_states)) + hidden_states = self.fc3(hidden_states) + return hidden_states + + +@auto_docstring +class EomtPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = EomtConfig + base_model_prefix = "eomt" + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + _no_split_modules = ["EomtLayer"] + _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_flash_attn_3 = True + + def _init_weights(self, module: nn.Module) -> None: + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(module.bias, -bound, bound) + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=1) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, EomtLayerScale): + if hasattr(module, "lambda1"): + module.lambda1.data.fill_(self.config.layerscale_value) + elif isinstance(module, EomtEmbeddings): + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), mean=0.0, std=std + ).to(module.cls_token.dtype) + module.register_tokens.data.zero_() + + +@auto_docstring( + custom_intro=""" + The EoMT Model with head on top for instance/semantic/panoptic segmentation. 
+ """ +) +class EomtForUniversalSegmentation(Mask2FormerForUniversalSegmentation, nn.Module): + def __init__(self, config: EomtConfig) -> None: + nn.Module().__init__(config) + self.config = config + self.num_hidden_layers = config.num_hidden_layers + self.embeddings = EomtEmbeddings(config) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.query = nn.Embedding(config.num_queries, config.hidden_size) + self.layers = nn.ModuleList([EomtLayer(config) for _ in range(config.num_hidden_layers)]) + + self.upscale_block = EomtScaleBlock(config) + self.mask_head = EomtMaskHead(config) + + self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1) + + self.grid_size = (config.image_size // config.patch_size, config.image_size // config.patch_size) + self.weight_dict: dict[str, float] = { + "loss_cross_entropy": config.class_weight, + "loss_mask": config.mask_weight, + "loss_dice": config.dice_weight, + } + + self.criterion = EomtLoss(config=config, weight_dict=self.weight_dict) + + self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks)) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def get_auxiliary_logits(self): + raise AttributeError("Note needed for Eomt Model.") + + def predict(self, logits: torch.Tensor): + query_tokens = logits[:, : self.config.num_queries, :] + class_logits = self.class_predictor(query_tokens) + + prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :] + prefix_tokens = prefix_tokens.transpose(1, 2) + + prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size) + + query_tokens = self.mask_head(query_tokens) + prefix_tokens = self.upscale_block(prefix_tokens) + + mask_logits = torch.einsum("bqc, bchw -> bqhw", query_tokens, prefix_tokens) + + return mask_logits, class_logits + + @staticmethod + def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device): + if prob < 1: + # Generate random queries to disable based on the probs + random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob + + # Disable attention to the query tokens, considering the prefix tokens + attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = 1 + + return attn_mask + + @auto_docstring + @can_return_tuple + def forward( + self, + pixel_values: Tensor, + mask_labels: Optional[list[Tensor]] = None, + class_labels: Optional[list[Tensor]] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + patch_offsets: Optional[list[Tensor]] = None, + ): + r""" + mask_labels (`list[torch.Tensor]`, *optional*): + list of mask labels of shape `(num_labels, height, width)` to be fed to a model + class_labels (`list[torch.LongTensor]`, *optional*): + list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the + labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`. + patch_offsets (`list[torch.Tensor]`, *optional*): + list of tuples indicating the image index and start and end positions of patches for semantic segementation. 
+ """ + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + masks_queries_logits_per_layer, class_queries_logits_per_layer = (), () + attention_mask = None + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + for idx, layer_module in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if idx == self.num_hidden_layers - self.config.num_blocks: + query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device) + hidden_states = torch.cat((query, hidden_states), dim=1) + + if idx >= self.num_hidden_layers - self.config.num_blocks and ( + self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0 + ): + norm_hidden_states = self.layernorm(hidden_states) + masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states) + + masks_queries_logits_per_layer += (masks_queries_logits,) + class_queries_logits_per_layer += (class_queries_logits,) + + attention_mask = torch.ones( + hidden_states.shape[0], + hidden_states.shape[1], + hidden_states.shape[1], + device=hidden_states.device, + dtype=torch.bool, + ) + + interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear") + interpolated_logits = interpolated_logits.view( + interpolated_logits.size(0), interpolated_logits.size(1), -1 + ) + + num_query_tokens = self.config.num_queries + encoder_start_tokens = num_query_tokens + self.embeddings.num_prefix_tokens + + # Set attention mask for queries to focus on encoder tokens based on interpolated logits + attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0 + + # Disable attention mask for random query tokens. + attention_mask = self._disable_attention_mask( + attention_mask, + prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks], + num_query_tokens=num_query_tokens, + encoder_start_tokens=encoder_start_tokens, + device=attention_mask.device, + ) + + # Expand attention mask to 4d mask. 
+ attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1) + attention_mask = attention_mask.float().masked_fill(~attention_mask, -1e9) + + layer_outputs = layer_module(hidden_states, attention_mask, output_attentions) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + sequence_output = self.layernorm(hidden_states) + if output_hidden_states: + all_hidden_states += (sequence_output,) + + masks_queries_logits, class_queries_logits = self.predict(sequence_output) + masks_queries_logits_per_layer += (masks_queries_logits,) + class_queries_logits_per_layer += (class_queries_logits,) + + loss = None + if mask_labels is not None and class_labels is not None: + loss = 0.0 + for masks_queries_logits, class_queries_logits in zip( + masks_queries_logits_per_layer, class_queries_logits_per_layer + ): + loss_dict = self.get_loss_dict( + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + mask_labels=mask_labels, + class_labels=class_labels, + auxiliary_predictions=None, + ) + loss += self.get_loss(loss_dict) + + return EomtForUniversalSegmentationOutput( + loss=loss, + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + last_hidden_state=sequence_output, + hidden_states=all_hidden_states, + attentions=all_attentions, + patch_offsets=patch_offsets, + ) + + +__all__ = ["EomtConfig", "EomtPreTrainedModel", "EomtForUniversalSegmentation"] diff --git a/transformers/src/transformers/models/fastspeech2_conformer/__init__.py b/transformers/src/transformers/models/fastspeech2_conformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..44d1ec7236310774ed6b1379683c144d7f93ecce --- /dev/null +++ b/transformers/src/transformers/models/fastspeech2_conformer/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_fastspeech2_conformer import * + from .modeling_fastspeech2_conformer import * + from .tokenization_fastspeech2_conformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/transformers/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/transformers/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..89d65a261c64fbabe493aa37677aaacb6f226a3b --- /dev/null +++ b/transformers/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -0,0 +1,480 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""FastSpeech2Conformer model configuration""" + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FastSpeech2ConformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to + instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 384): + The dimensionality of the hidden layers. + vocab_size (`int`, *optional*, defaults to 78): + The size of the vocabulary. + num_mel_bins (`int`, *optional*, defaults to 80): + The number of mel filters used in the filter bank. + encoder_num_attention_heads (`int`, *optional*, defaults to 2): + The number of attention heads in the encoder. + encoder_layers (`int`, *optional*, defaults to 4): + The number of layers in the encoder. + encoder_linear_units (`int`, *optional*, defaults to 1536): + The number of units in the linear layer of the encoder. + decoder_layers (`int`, *optional*, defaults to 4): + The number of layers in the decoder. + decoder_num_attention_heads (`int`, *optional*, defaults to 2): + The number of attention heads in the decoder. + decoder_linear_units (`int`, *optional*, defaults to 1536): + The number of units in the linear layer of the decoder. + speech_decoder_postnet_layers (`int`, *optional*, defaults to 5): + The number of layers in the post-net of the speech decoder. + speech_decoder_postnet_units (`int`, *optional*, defaults to 256): + The number of units in the post-net layers of the speech decoder. + speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5): + The kernel size in the post-net of the speech decoder. + positionwise_conv_kernel_size (`int`, *optional*, defaults to 3): + The size of the convolution kernel used in the position-wise layer. + encoder_normalize_before (`bool`, *optional*, defaults to `False`): + Specifies whether to normalize before encoder layers. + decoder_normalize_before (`bool`, *optional*, defaults to `False`): + Specifies whether to normalize before decoder layers. + encoder_concat_after (`bool`, *optional*, defaults to `False`): + Specifies whether to concatenate after encoder layers. + decoder_concat_after (`bool`, *optional*, defaults to `False`): + Specifies whether to concatenate after decoder layers. + reduction_factor (`int`, *optional*, defaults to 1): + The factor by which the speech frame rate is reduced. 
+ speaking_speed (`float`, *optional*, defaults to 1.0): + The speed of the speech produced. + use_macaron_style_in_conformer (`bool`, *optional*, defaults to `True`): + Specifies whether to use macaron style in the conformer. + use_cnn_in_conformer (`bool`, *optional*, defaults to `True`): + Specifies whether to use convolutional neural networks in the conformer. + encoder_kernel_size (`int`, *optional*, defaults to 7): + The kernel size used in the encoder. + decoder_kernel_size (`int`, *optional*, defaults to 31): + The kernel size used in the decoder. + duration_predictor_layers (`int`, *optional*, defaults to 2): + The number of layers in the duration predictor. + duration_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the duration predictor. + duration_predictor_kernel_size (`int`, *optional*, defaults to 3): + The kernel size used in the duration predictor. + energy_predictor_layers (`int`, *optional*, defaults to 2): + The number of layers in the energy predictor. + energy_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the energy predictor. + energy_predictor_kernel_size (`int`, *optional*, defaults to 3): + The kernel size used in the energy predictor. + energy_predictor_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the energy predictor. + energy_embed_kernel_size (`int`, *optional*, defaults to 1): + The kernel size used in the energy embed layer. + energy_embed_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate in the energy embed layer. + stop_gradient_from_energy_predictor (`bool`, *optional*, defaults to `False`): + Specifies whether to stop gradients from the energy predictor. + pitch_predictor_layers (`int`, *optional*, defaults to 5): + The number of layers in the pitch predictor. + pitch_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the pitch predictor. + pitch_predictor_kernel_size (`int`, *optional*, defaults to 5): + The kernel size used in the pitch predictor. + pitch_predictor_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the pitch predictor. + pitch_embed_kernel_size (`int`, *optional*, defaults to 1): + The kernel size used in the pitch embed layer. + pitch_embed_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate in the pitch embed layer. + stop_gradient_from_pitch_predictor (`bool`, *optional*, defaults to `True`): + Specifies whether to stop gradients from the pitch predictor. + encoder_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the encoder. + encoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2): + The positional dropout rate in the encoder. + encoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2): + The attention dropout rate in the encoder. + decoder_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the decoder. + decoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2): + The positional dropout rate in the decoder. + decoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2): + The attention dropout rate in the decoder. + duration_predictor_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the duration predictor. + speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the speech decoder postnet. 
+ max_source_positions (`int`, *optional*, defaults to 5000): + if `"relative"` position embeddings are used, defines the maximum source input positions. + use_masking (`bool`, *optional*, defaults to `True`): + Specifies whether to use masking in the model. + use_weighted_masking (`bool`, *optional*, defaults to `False`): + Specifies whether to use weighted masking in the model. + num_speakers (`int`, *optional*): + Number of speakers. If set to > 1, assume that the speaker ids will be provided as the input and use + speaker id embedding layer. + num_languages (`int`, *optional*): + Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the + language id embedding layer. + speaker_embed_dim (`int`, *optional*): + Speaker embedding dimension. If set to > 0, assume that speaker_embedding will be provided as the input. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Specifies whether the model is an encoder-decoder. + + Example: + + ```python + >>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig + + >>> # Initializing a FastSpeech2Conformer style configuration + >>> configuration = FastSpeech2ConformerConfig() + + >>> # Initializing a model from the FastSpeech2Conformer style configuration + >>> model = FastSpeech2ConformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "fastspeech2_conformer" + base_config_key = "model_config" + attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"} + + def __init__( + self, + hidden_size=384, + vocab_size=78, + num_mel_bins=80, + encoder_num_attention_heads=2, + encoder_layers=4, + encoder_linear_units=1536, + decoder_layers=4, + decoder_num_attention_heads=2, + decoder_linear_units=1536, + speech_decoder_postnet_layers=5, + speech_decoder_postnet_units=256, + speech_decoder_postnet_kernel=5, + positionwise_conv_kernel_size=3, + encoder_normalize_before=False, + decoder_normalize_before=False, + encoder_concat_after=False, + decoder_concat_after=False, + reduction_factor=1, + speaking_speed=1.0, + use_macaron_style_in_conformer=True, + use_cnn_in_conformer=True, + encoder_kernel_size=7, + decoder_kernel_size=31, + duration_predictor_layers=2, + duration_predictor_channels=256, + duration_predictor_kernel_size=3, + energy_predictor_layers=2, + energy_predictor_channels=256, + energy_predictor_kernel_size=3, + energy_predictor_dropout=0.5, + energy_embed_kernel_size=1, + energy_embed_dropout=0.0, + stop_gradient_from_energy_predictor=False, + pitch_predictor_layers=5, + pitch_predictor_channels=256, + pitch_predictor_kernel_size=5, + pitch_predictor_dropout=0.5, + pitch_embed_kernel_size=1, + pitch_embed_dropout=0.0, + stop_gradient_from_pitch_predictor=True, + encoder_dropout_rate=0.2, + encoder_positional_dropout_rate=0.2, + encoder_attention_dropout_rate=0.2, + decoder_dropout_rate=0.2, + decoder_positional_dropout_rate=0.2, + decoder_attention_dropout_rate=0.2, + duration_predictor_dropout_rate=0.2, + speech_decoder_postnet_dropout=0.5, + max_source_positions=5000, + use_masking=True, + use_weighted_masking=False, + num_speakers=None, + num_languages=None, + speaker_embed_dim=None, + is_encoder_decoder=True, + **kwargs, + ): + if positionwise_conv_kernel_size % 2 == 0: + raise ValueError( + f"positionwise_conv_kernel_size must be odd, but got {positionwise_conv_kernel_size} instead." 
+ ) + if encoder_kernel_size % 2 == 0: + raise ValueError(f"encoder_kernel_size must be odd, but got {encoder_kernel_size} instead.") + if decoder_kernel_size % 2 == 0: + raise ValueError(f"decoder_kernel_size must be odd, but got {decoder_kernel_size} instead.") + if duration_predictor_kernel_size % 2 == 0: + raise ValueError( + f"duration_predictor_kernel_size must be odd, but got {duration_predictor_kernel_size} instead." + ) + if energy_predictor_kernel_size % 2 == 0: + raise ValueError( + f"energy_predictor_kernel_size must be odd, but got {energy_predictor_kernel_size} instead." + ) + if energy_embed_kernel_size % 2 == 0: + raise ValueError(f"energy_embed_kernel_size must be odd, but got {energy_embed_kernel_size} instead.") + if pitch_predictor_kernel_size % 2 == 0: + raise ValueError( + f"pitch_predictor_kernel_size must be odd, but got {pitch_predictor_kernel_size} instead." + ) + if pitch_embed_kernel_size % 2 == 0: + raise ValueError(f"pitch_embed_kernel_size must be odd, but got {pitch_embed_kernel_size} instead.") + if hidden_size % encoder_num_attention_heads != 0: + raise ValueError("The hidden_size must be evenly divisible by encoder_num_attention_heads.") + if hidden_size % decoder_num_attention_heads != 0: + raise ValueError("The hidden_size must be evenly divisible by decoder_num_attention_heads.") + if use_masking and use_weighted_masking: + raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.") + + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.num_mel_bins = num_mel_bins + self.encoder_config = { + "num_attention_heads": encoder_num_attention_heads, + "layers": encoder_layers, + "kernel_size": encoder_kernel_size, + "attention_dropout_rate": encoder_attention_dropout_rate, + "dropout_rate": encoder_dropout_rate, + "positional_dropout_rate": encoder_positional_dropout_rate, + "linear_units": encoder_linear_units, + "normalize_before": encoder_normalize_before, + "concat_after": encoder_concat_after, + } + self.decoder_config = { + "num_attention_heads": decoder_num_attention_heads, + "layers": decoder_layers, + "kernel_size": decoder_kernel_size, + "attention_dropout_rate": decoder_attention_dropout_rate, + "dropout_rate": decoder_dropout_rate, + "positional_dropout_rate": decoder_positional_dropout_rate, + "linear_units": decoder_linear_units, + "normalize_before": decoder_normalize_before, + "concat_after": decoder_concat_after, + } + self.encoder_num_attention_heads = encoder_num_attention_heads + self.encoder_layers = encoder_layers + self.duration_predictor_channels = duration_predictor_channels + self.duration_predictor_kernel_size = duration_predictor_kernel_size + self.duration_predictor_layers = duration_predictor_layers + self.energy_embed_dropout = energy_embed_dropout + self.energy_embed_kernel_size = energy_embed_kernel_size + self.energy_predictor_channels = energy_predictor_channels + self.energy_predictor_dropout = energy_predictor_dropout + self.energy_predictor_kernel_size = energy_predictor_kernel_size + self.energy_predictor_layers = energy_predictor_layers + self.pitch_embed_dropout = pitch_embed_dropout + self.pitch_embed_kernel_size = pitch_embed_kernel_size + self.pitch_predictor_channels = pitch_predictor_channels + self.pitch_predictor_dropout = pitch_predictor_dropout + self.pitch_predictor_kernel_size = pitch_predictor_kernel_size + self.pitch_predictor_layers = pitch_predictor_layers + self.positionwise_conv_kernel_size = positionwise_conv_kernel_size + 
self.speech_decoder_postnet_units = speech_decoder_postnet_units + self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout + self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel + self.speech_decoder_postnet_layers = speech_decoder_postnet_layers + self.reduction_factor = reduction_factor + self.speaking_speed = speaking_speed + self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor + self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor + self.max_source_positions = max_source_positions + self.use_cnn_in_conformer = use_cnn_in_conformer + self.use_macaron_style_in_conformer = use_macaron_style_in_conformer + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + self.num_speakers = num_speakers + self.num_languages = num_languages + self.speaker_embed_dim = speaker_embed_dim + self.duration_predictor_dropout_rate = duration_predictor_dropout_rate + self.is_encoder_decoder = is_encoder_decoder + + super().__init__( + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + + +class FastSpeech2ConformerHifiGanConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGanModel`]. It is used to + instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2Conformer + [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + model_in_dim (`int`, *optional*, defaults to 80): + The number of frequency bins in the input log-mel spectrogram. + upsample_initial_channel (`int`, *optional*, defaults to 512): + The number of input channels into the upsampling network. + upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[8, 8, 2, 2]`): + A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The + length of *upsample_rates* defines the number of convolutional layers and has to match the length of + *upsample_kernel_sizes*. + upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[16, 16, 4, 4]`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The + length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of + *upsample_rates*. + resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`): + A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field + fusion (MRF) module. + resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`): + A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the + multi-receptive field fusion (MRF) module. + initializer_range (`float`, *optional*, defaults to 0.01): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + leaky_relu_slope (`float`, *optional*, defaults to 0.1): + The angle of the negative slope used by the leaky ReLU activation. 
+ normalize_before (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance. + + Example: + + ```python + >>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig + + >>> # Initializing a FastSpeech2ConformerHifiGan configuration + >>> configuration = FastSpeech2ConformerHifiGanConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = FastSpeech2ConformerHifiGan(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "hifigan" + base_config_key = "vocoder_config" + + def __init__( + self, + model_in_dim=80, + upsample_initial_channel=512, + upsample_rates=[8, 8, 2, 2], + upsample_kernel_sizes=[16, 16, 4, 4], + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + initializer_range=0.01, + leaky_relu_slope=0.1, + normalize_before=True, + **kwargs, + ): + self.model_in_dim = model_in_dim + self.upsample_initial_channel = upsample_initial_channel + self.upsample_rates = upsample_rates + self.upsample_kernel_sizes = upsample_kernel_sizes + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.initializer_range = initializer_range + self.leaky_relu_slope = leaky_relu_slope + self.normalize_before = normalize_before + super().__init__(**kwargs) + + +class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. It is used to + instantiate a `FastSpeech2ConformerWithHifiGanModel` model according to the specified sub-models configurations, + defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and + FastSpeech2ConformerHifiGan + [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + model_config (`dict`, *optional*): + Configuration dictionary of the text-to-speech model, used to initialize a [`FastSpeech2ConformerConfig`]. + vocoder_config (`dict`, *optional*): + Configuration dictionary of the vocoder model, used to initialize a [`FastSpeech2ConformerHifiGanConfig`]. + + Example: + + ```python + >>> from transformers import ( + ... FastSpeech2ConformerConfig, + ... FastSpeech2ConformerHifiGanConfig, + ... FastSpeech2ConformerWithHifiGanConfig, + ... FastSpeech2ConformerWithHifiGan, + ... ) + + >>> # Initializing FastSpeech2ConformerWithHifiGan sub-modules configurations.
+ >>> model_config = FastSpeech2ConformerConfig() + >>> vocoder_config = FastSpeech2ConformerHifiGanConfig() + + >>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration + >>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict()) + + >>> # Initializing a model (with random weights) + >>> model = FastSpeech2ConformerWithHifiGan(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "fastspeech2_conformer_with_hifigan" + sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig} + + def __init__( + self, + model_config: Optional[dict] = None, + vocoder_config: Optional[dict] = None, + **kwargs, + ): + if model_config is None: + model_config = {} + logger.info("model_config is None. Initializing the model with default values.") + + if vocoder_config is None: + vocoder_config = {} + logger.info("vocoder_config is None. Initializing the vocoder with default values.") + + self.model_config = FastSpeech2ConformerConfig(**model_config) + self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config) + + super().__init__(**kwargs) + + +__all__ = ["FastSpeech2ConformerConfig", "FastSpeech2ConformerHifiGanConfig", "FastSpeech2ConformerWithHifiGanConfig"] diff --git a/transformers/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py b/transformers/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5bb2d2e2e92481739184d30846c5ee1c987f40 --- /dev/null +++ b/transformers/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,210 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
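+# A typical invocation might look as follows (an illustrative sketch: only the first two
+# flags are visible in this excerpt; `--pytorch_dump_folder_path` is assumed here to
+# mirror the `pytorch_dump_folder_path` argument of the conversion function below):
+#
+#   python convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py \
+#       --checkpoint_path /path/to/espnet/model.pth \
+#       --yaml_config_path /path/to/espnet/config.yaml \
+#       --pytorch_dump_folder_path ./fastspeech2_conformer_hf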
+"""Convert FastSpeech2Conformer checkpoint.""" + +import argparse +import json +import re +from pathlib import Path +from tempfile import TemporaryDirectory + +import torch +import yaml + +from transformers import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerModel, + FastSpeech2ConformerTokenizer, + logging, +) + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + +CONFIG_MAPPING = { + "adim": "hidden_size", + "aheads": "num_attention_heads", + "conformer_dec_kernel_size": "decoder_kernel_size", + "conformer_enc_kernel_size": "encoder_kernel_size", + "decoder_normalize_before": "decoder_normalize_before", + "dlayers": "decoder_layers", + "dunits": "decoder_linear_units", + "duration_predictor_chans": "duration_predictor_channels", + "duration_predictor_kernel_size": "duration_predictor_kernel_size", + "duration_predictor_layers": "duration_predictor_layers", + "elayers": "encoder_layers", + "encoder_normalize_before": "encoder_normalize_before", + "energy_embed_dropout": "energy_embed_dropout", + "energy_embed_kernel_size": "energy_embed_kernel_size", + "energy_predictor_chans": "energy_predictor_channels", + "energy_predictor_dropout": "energy_predictor_dropout", + "energy_predictor_kernel_size": "energy_predictor_kernel_size", + "energy_predictor_layers": "energy_predictor_layers", + "eunits": "encoder_linear_units", + "pitch_embed_dropout": "pitch_embed_dropout", + "pitch_embed_kernel_size": "pitch_embed_kernel_size", + "pitch_predictor_chans": "pitch_predictor_channels", + "pitch_predictor_dropout": "pitch_predictor_dropout", + "pitch_predictor_kernel_size": "pitch_predictor_kernel_size", + "pitch_predictor_layers": "pitch_predictor_layers", + "positionwise_conv_kernel_size": "positionwise_conv_kernel_size", + "postnet_chans": "speech_decoder_postnet_units", + "postnet_filts": "speech_decoder_postnet_kernel", + "postnet_layers": "speech_decoder_postnet_layers", + "reduction_factor": "reduction_factor", + "stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor", + "stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor", + "transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate", + "transformer_dec_dropout_rate": "decoder_dropout_rate", + "transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate", + "transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate", + "transformer_enc_dropout_rate": "encoder_dropout_rate", + "transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate", + "use_cnn_in_conformer": "use_cnn_in_conformer", + "use_macaron_style_in_conformer": "use_macaron_style_in_conformer", + "use_masking": "use_masking", + "use_weighted_masking": "use_weighted_masking", + "idim": "input_dim", + "odim": "num_mel_bins", + "spk_embed_dim": "speaker_embed_dim", + "langs": "num_languages", + "spks": "num_speakers", +} + + +def remap_model_yaml_config(yaml_config_path): + with Path(yaml_config_path).open("r", encoding="utf-8") as f: + args = yaml.safe_load(f) + args = argparse.Namespace(**args) + + remapped_config = {} + + model_params = args.tts_conf["text2mel_params"] + # espnet_config_key -> hf_config_key, any keys not included are ignored + for espnet_config_key, hf_config_key in CONFIG_MAPPING.items(): + if espnet_config_key in model_params: + remapped_config[hf_config_key] = model_params[espnet_config_key] + + return remapped_config, args.g2p, args.token_list + + +def convert_espnet_state_dict_to_hf(state_dict): + 
new_state_dict = {} + for key in state_dict: + if "tts.generator.text2mel." in key: + new_key = key.replace("tts.generator.text2mel.", "") + if "postnet" in key: + new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers") + new_key = new_key.replace(".0.weight", ".conv.weight") + new_key = new_key.replace(".1.weight", ".batch_norm.weight") + new_key = new_key.replace(".1.bias", ".batch_norm.bias") + new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean") + new_key = new_key.replace(".1.running_var", ".batch_norm.running_var") + new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked") + if "feat_out" in key: + if "weight" in key: + new_key = "speech_decoder_postnet.feat_out.weight" + if "bias" in key: + new_key = "speech_decoder_postnet.feat_out.bias" + if "encoder.embed.0.weight" in key: + new_key = new_key.replace("0.", "") + if "w_1" in key: + new_key = new_key.replace("w_1", "conv1") + if "w_2" in key: + new_key = new_key.replace("w_2", "conv2") + if "predictor.conv" in key: + new_key = new_key.replace(".conv", ".conv_layers") + pattern = r"(\d)\.(\d)" + replacement = ( + r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm" + ) + new_key = re.sub(pattern, replacement, new_key) + if "pitch_embed" in key or "energy_embed" in key: + new_key = new_key.replace("0", "conv") + if "encoders" in key: + new_key = new_key.replace("encoders", "conformer_layers") + new_key = new_key.replace("norm_final", "final_layer_norm") + new_key = new_key.replace("norm_mha", "self_attn_layer_norm") + new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm") + new_key = new_key.replace("norm_ff", "ff_layer_norm") + new_key = new_key.replace("norm_conv", "conv_layer_norm") + if "lid_emb" in key: + new_key = new_key.replace("lid_emb", "language_id_embedding") + if "sid_emb" in key: + new_key = new_key.replace("sid_emb", "speaker_id_embedding") + + new_state_dict[new_key] = state_dict[key] + + return new_state_dict + + +@torch.no_grad() +def convert_FastSpeech2ConformerModel_checkpoint( + checkpoint_path, + yaml_config_path, + pytorch_dump_folder_path, + repo_id=None, +): + model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path) + config = FastSpeech2ConformerConfig(**model_params) + + # Prepare the model + model = FastSpeech2ConformerModel(config) + + espnet_checkpoint = torch.load(checkpoint_path, weights_only=True) + hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) + + model.load_state_dict(hf_compatible_state_dict) + + model.save_pretrained(pytorch_dump_folder_path) + + # Prepare the tokenizer + with TemporaryDirectory() as tempdir: + vocab = {token: id for id, token in enumerate(vocab)} + vocab_file = Path(tempdir) / "vocab.json" + with open(vocab_file, "w") as f: + json.dump(vocab, f) + should_strip_spaces = "no_space" in tokenizer_name + tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces) + + tokenizer.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + model.push_to_hub(repo_id) + tokenizer.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument( + "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" + ) + 
parser.add_argument( + "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." + ) + + args = parser.parse_args() + convert_FastSpeech2ConformerModel_checkpoint( + args.checkpoint_path, + args.yaml_config_path, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/transformers/src/transformers/models/fastspeech2_conformer/convert_hifigan.py b/transformers/src/transformers/models/fastspeech2_conformer/convert_hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..70aada84bd5b49a28bbdcee605dfa035a20a20b9 --- /dev/null +++ b/transformers/src/transformers/models/fastspeech2_conformer/convert_hifigan.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert FastSpeech2Conformer HiFi-GAN checkpoint.""" + +import argparse +from pathlib import Path + +import torch +import yaml + +from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + + +def load_weights(checkpoint, hf_model, config): + vocoder_key_prefix = "tts.generator.vocoder." 
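+ # Keep only the vocoder weights and strip the ESPnet prefix so the keys line up with the HiFi-GAN module; + # weight norm is applied below because the checkpoint stores the `weight_g`/`weight_v` decomposition.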
+ checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k} + + hf_model.apply_weight_norm() + + hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"] + hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"] + hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"] + + for i in range(len(config.upsample_rates)): + hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"] + hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"] + hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"] + + for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)): + for j in range(len(config.resblock_dilation_sizes)): + hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"] + hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"] + hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"] + + hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"] + hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"] + hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"] + + hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"] + hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"] + hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"] + + hf_model.remove_weight_norm() + + +def remap_hifigan_yaml_config(yaml_config_path): + with Path(yaml_config_path).open("r", encoding="utf-8") as f: + args = yaml.safe_load(f) + args = argparse.Namespace(**args) + + vocoder_type = args.tts_conf["vocoder_type"] + if vocoder_type != "hifigan_generator": + raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}") + + remapped_dict = {} + vocoder_params = args.tts_conf["vocoder_params"] + + # espnet_config_key -> hf_config_key + key_mappings = { + "channels": "upsample_initial_channel", + "in_channels": "model_in_dim", + "resblock_dilations": "resblock_dilation_sizes", + "resblock_kernel_sizes": "resblock_kernel_sizes", + "upsample_kernel_sizes": "upsample_kernel_sizes", + "upsample_scales": "upsample_rates", + } + for espnet_config_key, hf_config_key in key_mappings.items(): + remapped_dict[hf_config_key] = vocoder_params[espnet_config_key] + remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"] + remapped_dict["normalize_before"] = False + remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"] + + return remapped_dict + + +@torch.no_grad() +def convert_hifigan_checkpoint( + checkpoint_path, + pytorch_dump_folder_path, + yaml_config_path=None, + repo_id=None, +): + if yaml_config_path is not None: + config_kwargs = remap_hifigan_yaml_config(yaml_config_path) + config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) + else: + config = FastSpeech2ConformerHifiGanConfig() + + model = FastSpeech2ConformerHifiGan(config) + + orig_checkpoint = torch.load(checkpoint_path, weights_only=True) + load_weights(orig_checkpoint, model, config) + + model.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + model.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to 
original checkpoint") + parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert") + parser.add_argument( + "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." + ) + + args = parser.parse_args() + convert_hifigan_checkpoint( + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.yaml_config_path, + args.push_to_hub, + ) diff --git a/transformers/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py b/transformers/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py new file mode 100644 index 0000000000000000000000000000000000000000..6f840438dcaea99ab9ac51e8373bb1b5f753169c --- /dev/null +++ b/transformers/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py @@ -0,0 +1,102 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert FastSpeech2Conformer checkpoint.""" + +import argparse + +import torch + +from transformers import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerHifiGan, + FastSpeech2ConformerHifiGanConfig, + FastSpeech2ConformerModel, + FastSpeech2ConformerWithHifiGan, + FastSpeech2ConformerWithHifiGanConfig, + logging, +) + +from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import ( + convert_espnet_state_dict_to_hf, + remap_model_yaml_config, +) +from .convert_hifigan import load_weights, remap_hifigan_yaml_config + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + + +def convert_FastSpeech2ConformerWithHifiGan_checkpoint( + checkpoint_path, + yaml_config_path, + pytorch_dump_folder_path, + repo_id=None, +): + # Prepare the model + model_params, *_ = remap_model_yaml_config(yaml_config_path) + model_config = FastSpeech2ConformerConfig(**model_params) + + model = FastSpeech2ConformerModel(model_config) + + espnet_checkpoint = torch.load(checkpoint_path, weights_only=True) + hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) + model.load_state_dict(hf_compatible_state_dict) + + # Prepare the vocoder + config_kwargs = remap_hifigan_yaml_config(yaml_config_path) + vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) + + vocoder = FastSpeech2ConformerHifiGan(vocoder_config) + load_weights(espnet_checkpoint, vocoder, vocoder_config) + + # Prepare the model + vocoder + config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config) + with_hifigan_model = FastSpeech2ConformerWithHifiGan(config) + with_hifigan_model.model = model + with_hifigan_model.vocoder = vocoder + + with_hifigan_model.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + with_hifigan_model.push_to_hub(repo_id) + + +if 
__name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument( + "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" + ) + parser.add_argument( + "--pytorch_dump_folder_path", + required=True, + default=None, + type=str, + help="Path to the output `FastSpeech2ConformerModel` PyTorch model.", + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." + ) + + args = parser.parse_args() + + convert_FastSpeech2ConformerWithHifiGan_checkpoint( + args.checkpoint_path, + args.yaml_config_path, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/transformers/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/transformers/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f19ea88c176d48dc61c1ec7d9546b639ec3b2c0d --- /dev/null +++ b/transformers/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -0,0 +1,1567 @@ +# coding=utf-8 +# Copyright 2023 The Espnet authors, IMS Toucan authors, and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch FastSpeech2Conformer model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, logging +from .configuration_fastspeech2_conformer import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerHifiGanConfig, + FastSpeech2ConformerWithHifiGanConfig, +) + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`FastSpeech2ConformerModel`]. + """ +) +class FastSpeech2ConformerModelOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Spectrogram generation loss. + duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*): + Outputs of the duration predictor. + pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the pitch predictor. + energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the energy predictor. 
+ """ + + loss: Optional[torch.FloatTensor] = None + spectrogram: Optional[torch.FloatTensor] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + duration_outputs: Optional[torch.LongTensor] = None + pitch_outputs: Optional[torch.FloatTensor] = None + energy_outputs: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`FastSpeech2ConformerWithHifiGan`]. + """ +) +class FastSpeech2ConformerWithHifiGanOutput(FastSpeech2ConformerModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Spectrogram generation loss. + duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*): + Outputs of the duration predictor. + pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the pitch predictor. + energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the energy predictor. + waveform (`torch.FloatTensor` of shape `(batch_size, audio_length)`): + Speech output as a result of passing the predicted mel spectrogram through the vocoder. + """ + + waveform: Optional[torch.FloatTensor] = None + + +def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0): + """ + Length regulator for feed-forward Transformer. + + This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech` + https://huggingface.co/papers/1905.09263. The length regulator expands char or phoneme-level embedding features to + frame-level by repeating each feature based on the corresponding predicted durations. + + Args: + encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`): + Batch of sequences of char or phoneme embeddings. + duration_labels (`torch.LongTensor` of shape `(batch_size, time)`): + Batch of durations of each frame. + speaking_speed (`float`, *optional*, defaults to 1.0): + Value to control speed of speech. + + Returns: + `torch.Tensor`: + Replicated input tensor based on durations (batch_size, time*, embedding_dim). + """ + + if speaking_speed <= 0: + raise ValueError("`speaking_speed` must be greater than 0.") + elif speaking_speed != 1.0: + duration_labels = torch.round(duration_labels.float() * speaking_speed).long() + + if duration_labels.sum() == 0: + duration_labels[duration_labels.sum(dim=1).eq(0)] = 1 + + # Calculate the maximum length needed + max_len = torch.sum(duration_labels, dim=1).max() + + # Create a padded tensor to hold the results + hidden_states = torch.zeros( + (encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)), + dtype=torch.float, + device=encoded_embeddings.device, + ) + + # Loop through the batch and fill in the data + for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)): + repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0) + hidden_states[i, : repeated.size(0)] = repeated + + return hidden_states + + +class FastSpeech2ConformerDurationPredictor(nn.Module): + """ + Duration predictor module. 
+ + This is the duration predictor module described in 'FastSpeech: Fast, Robust and Controllable Text to + Speech' https://huggingface.co/papers/1905.09263. It predicts the duration of each frame in the log domain + from the hidden embeddings of the encoder. + + Note: + The calculation domain of the outputs differs between training and inference. During training, the + outputs are calculated in the log domain, but during inference they are calculated in the linear domain. + + """ + + def __init__(self, config: FastSpeech2ConformerConfig): + super().__init__() + + self.conv_layers = nn.ModuleList() + self.log_domain_offset = 1.0 + + for layer_idx in range(config.duration_predictor_layers): + num_chans = config.duration_predictor_channels + input_channels = config.hidden_size if layer_idx == 0 else num_chans + layer = FastSpeech2ConformerPredictorLayer( + input_channels, + num_chans, + config.duration_predictor_kernel_size, + config.duration_predictor_dropout_rate, + ) + self.conv_layers.append(layer) + self.linear = nn.Linear(config.duration_predictor_channels, 1) + + def forward(self, encoder_hidden_states): + """ + Args: + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`): + Batch of input sequences. + + Returns: + `torch.Tensor`: Batch of predicted durations in log domain `(batch_size, max_text_length)`. + + """ + # (batch_size, input_dim, max_text_length) + hidden_states = encoder_hidden_states.transpose(1, -1) + for layer in self.conv_layers: + hidden_states = layer(hidden_states) + + # NOTE: calculate in log domain, (batch_size, max_text_length) + hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1) + + if not self.training: + # NOTE: calculate in linear domain + hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long() + + return hidden_states + + +# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5BatchNormConvLayer +class FastSpeech2ConformerBatchNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + + if layer_id == 0: + in_conv_dim = config.num_mel_bins + else: + in_conv_dim = config.speech_decoder_postnet_units + + if layer_id == config.speech_decoder_postnet_layers - 1: + out_conv_dim = config.num_mel_bins + else: + out_conv_dim = config.speech_decoder_postnet_units + + self.conv = nn.Conv1d( + in_conv_dim, + out_conv_dim, + kernel_size=config.speech_decoder_postnet_kernel, + stride=1, + padding=(config.speech_decoder_postnet_kernel - 1) // 2, + bias=False, + ) + self.batch_norm = nn.BatchNorm1d(out_conv_dim) + + if layer_id < config.speech_decoder_postnet_layers - 1: + self.activation = nn.Tanh() + else: + self.activation = None + + self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + if self.activation is not None: + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor) + self.layers = nn.ModuleList( + [FastSpeech2ConformerBatchNormConvLayer(config,
i) for i in range(config.speech_decoder_postnet_layers)] + ) + + def forward(self, hidden_states: torch.Tensor): + outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins) + layer_output = outputs_before_postnet.transpose(1, 2) + for layer in self.layers: + layer_output = layer(layer_output) + outputs_after_postnet = outputs_before_postnet + layer_output.transpose(1, 2) + return outputs_before_postnet, outputs_after_postnet + + +class FastSpeech2ConformerPredictorLayer(nn.Module): + def __init__(self, input_channels, num_chans, kernel_size, dropout_rate): + super().__init__() + self.conv = nn.Conv1d( + input_channels, + num_chans, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + ) + self.activation = nn.ReLU() + self.layer_norm = nn.LayerNorm(num_chans) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + + # Perform layer norm on dimension 1 + hidden_states = hidden_states.transpose(1, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(1, -1) + + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class FastSpeech2ConformerVariancePredictor(nn.Module): + def __init__( + self, + config: FastSpeech2ConformerConfig, + num_layers=2, + num_chans=384, + kernel_size=3, + dropout_rate=0.5, + ): + """ + Initialize variance predictor module. + + Args: + config ([`FastSpeech2ConformerConfig`]): Model configuration, which provides the input dimension. + num_layers (`int`, *optional*, defaults to 2): Number of convolutional layers. + num_chans (`int`, *optional*, defaults to 384): Number of channels of convolutional layers. + kernel_size (`int`, *optional*, defaults to 3): Kernel size of convolutional layers. + dropout_rate (`float`, *optional*, defaults to 0.5): Dropout rate. + """ + super().__init__() + self.conv_layers = nn.ModuleList() + for idx in range(num_layers): + input_channels = config.hidden_size if idx == 0 else num_chans + layer = FastSpeech2ConformerPredictorLayer(input_channels, num_chans, kernel_size, dropout_rate) + self.conv_layers.append(layer) + self.linear = nn.Linear(num_chans, 1) + + def forward(self, encoder_hidden_states, padding_masks=None): + """ + Calculate forward propagation. + + Args: + encoder_hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`): + Batch of input sequences. + padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*): + Batch of masks indicating padded part. + + Returns: + Tensor: Batch of predicted sequences `(batch_size, max_text_length, 1)`.
+ """ + # (batch_size, input_dim, max_text_length) + hidden_states = encoder_hidden_states.transpose(1, -1) + for layer in self.conv_layers: + hidden_states = layer(hidden_states) + + hidden_states = self.linear(hidden_states.transpose(1, 2)) + + if padding_masks is not None: + hidden_states = hidden_states.masked_fill(padding_masks, 0.0) + + return hidden_states + + +class FastSpeech2ConformerVarianceEmbedding(nn.Module): + def __init__( + self, + in_channels=1, + out_channels=384, + kernel_size=1, + padding=0, + dropout_rate=0.0, + ): + super().__init__() + self.conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + padding=padding, + ) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.conv(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class FastSpeech2ConformerAttention(nn.Module): + """ + Multi-Head attention layer with relative position encoding. Details can be found in + https://github.com/espnet/espnet/pull/2816. Paper: https://huggingface.co/papers/1901.02860. + """ + + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + """Construct an FastSpeech2ConformerAttention object.""" + super().__init__() + # We assume d_v always equals dim_key + self.num_heads = module_config["num_attention_heads"] + self.hidden_size = config.hidden_size + self.dim_key = self.hidden_size // self.num_heads + self.head_dim = self.hidden_size // self.num_heads + self.linear_q = nn.Linear(self.hidden_size, self.hidden_size) + self.linear_k = nn.Linear(self.hidden_size, self.hidden_size) + self.linear_v = nn.Linear(self.hidden_size, self.hidden_size) + self.linear_out = nn.Linear(self.hidden_size, self.hidden_size) + self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"]) + + # linear transformation for positional encoding + self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://huggingface.co/papers/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.num_heads, self.head_dim)) + + def shift_relative_position_tensor(self, pos_tensor): + """ + Args: + pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor. + """ + zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype) + pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1) + + pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2)) + # only keep the positions from 0 to time2 + pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1] + + return pos_tensor + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + pos_emb: Optional[torch.Tensor] = None, + output_attentions: Optional[torch.Tensor] = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Compute 'Scaled Dot Product Attention' with rel. positional encoding. + + Args: + hidden_states (`torch.Tensor` of shape `(batch, time2, size)`): Values of the hidden states + attention_mask (`torch.Tensor` of shape `(batch, time1, time2)`): Mask tensor. 
+ pos_emb (`torch.Tensor` of shape `(batch, 2*time1-1, size)`): Positional embedding tensor. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + Returns: + `torch.Tensor`: Output tensor of shape `(batch, time1, d_model)`. + """ + bsz, q_len, _ = hidden_states.size() + query_states = self.linear_q(hidden_states).view(bsz, -1, self.num_heads, self.head_dim) + key_states = self.linear_k(hidden_states).view(bsz, -1, self.num_heads, self.head_dim) + value_states = self.linear_v(hidden_states).view(bsz, -1, self.num_heads, self.head_dim) + + bsz_pos = pos_emb.size(0) + pos_encoding = self.linear_pos(pos_emb).view(bsz_pos, -1, self.num_heads, self.head_dim) + + # (batch_size, head, time1, dim_key) + query_with_bias_u = (query_states + self.pos_bias_u).transpose(1, 2) + # (batch_size, head, time1, dim_key) + query_with_bias_v = (query_states + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://huggingface.co/papers/1901.02860 Section 3.3 + # (batch_size, head, time1, time2) + matrix_ac = torch.matmul(query_with_bias_u, key_states.permute(0, 2, 3, 1)) + + # compute matrix b and matrix d + # (batch_size, head, time1, 2*time1-1) + matrix_bd = torch.matmul(query_with_bias_v, pos_encoding.permute(0, 2, 3, 1)) + matrix_bd = self.shift_relative_position_tensor(matrix_bd) + + # (batch_size, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt(self.dim_key) + + # Forward attention + if attention_mask is not None: + expected_size = (bsz, 1, q_len) + if attention_mask.size() != expected_size: + raise ValueError(f"Attention mask should be of size {expected_size}, but is {attention_mask.size()}") + attention_mask = attention_mask.unsqueeze(1).eq(0) + min_value = float(torch.finfo(scores.dtype).min) + scores = scores.masked_fill(attention_mask, min_value) + attn_weights = torch.softmax(scores, dim=-1).masked_fill(attention_mask, 0.0) + else: + attn_weights = torch.softmax(scores, dim=-1) + + attn_weights = self.dropout(attn_weights) + attn_output = torch.matmul(attn_weights, value_states.transpose(1, 2)) + attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1) + + attn_output = self.linear_out(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +class FastSpeech2ConformerConvolutionModule(nn.Module): + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + super().__init__() + # kernel_size should be an odd number for 'SAME' padding + channels = config.hidden_size + kernel_size = module_config["kernel_size"] + self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=True) + self.depthwise_conv = nn.Conv1d( + channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=True + ) + self.norm = nn.BatchNorm1d(channels) + self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=True) + + def forward(self, hidden_states): + """ + Compute convolution module. + + Args: + hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor. + + Returns: + `torch.Tensor`: Output tensor of shape `(batch, time, channels)`. 
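+ + The computation is: pointwise conv -> GLU -> depthwise conv -> batch norm -> swish (`x * sigmoid(x)`) -> + pointwise conv.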
+ + """ + # exchange the temporal dimension and the feature dimension + hidden_states = hidden_states.transpose(1, 2) + + # GLU mechanism, (batch_size, 2*channel, dim) + hidden_states = self.pointwise_conv1(hidden_states) + # (batch_size, channel, dim) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + # 1D Depthwise Conv + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.norm(hidden_states) + + hidden_states = hidden_states * torch.sigmoid(hidden_states) + + hidden_states = self.pointwise_conv2(hidden_states) + + return hidden_states.transpose(1, 2) + + +class FastSpeech2ConformerEncoderLayer(nn.Module): + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + super().__init__() + + # self-attention module definition + self.self_attn = FastSpeech2ConformerAttention(config, module_config) + + # feed-forward module definition + self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config) + + self.macaron_style = config.use_macaron_style_in_conformer + if self.macaron_style: + self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config) + self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + + # convolution module definition + self.use_cnn_module = config.use_cnn_in_conformer + if self.use_cnn_module: + self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config) + self.conv_layer_norm = nn.LayerNorm(config.hidden_size) + self.final_layer_norm = nn.LayerNorm(config.hidden_size) + + self.ff_layer_norm = nn.LayerNorm(config.hidden_size) + + self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size) + + self.dropout = nn.Dropout(module_config["dropout_rate"]) + self.size = config.hidden_size + self.normalize_before = module_config["normalize_before"] + self.concat_after = module_config["concat_after"] + if self.concat_after: + self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + pos_emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[torch.Tensor] = False, + ): + """ + Compute encoded features. + + Args: + hidden_states (`torch.Tensor` of shape `(batch, time, size)`): Input tensor. + pos_emb (`torch.Tensor` of shape `(1, time, size)`): Positional embeddings tensor. + attention_mask (`torch.Tensor` of shape `(batch, time)`): Attention mask tensor for the input. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + Returns: + `torch.Tensor`: Output tensor of shape `(batch, time, size)`. 
+ + """ + # whether to use macaron style + if self.macaron_style: + residual = hidden_states + if self.normalize_before: + hidden_states = self.ff_macaron_layer_norm(hidden_states) + hidden_states = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(hidden_states)) + if not self.normalize_before: + hidden_states = self.ff_macaron_layer_norm(hidden_states) + + # multi-headed self-attention module + residual = hidden_states + if self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + attention_output, attention_scores = self.self_attn( + hidden_states, attention_mask=attention_mask, pos_emb=pos_emb, output_attentions=output_attentions + ) + + if self.concat_after: + x_concat = torch.cat((hidden_states, attention_output), dim=-1) + hidden_states = self.concat_linear(x_concat) + hidden_states = residual + hidden_states + else: + hidden_states = self.dropout(attention_output) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # convolution module + if self.use_cnn_module: + residual = hidden_states + if self.normalize_before: + hidden_states = self.conv_layer_norm(hidden_states) + hidden_states = self.conv_module(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.conv_layer_norm(hidden_states) + + # feed forward module + residual = hidden_states + if self.normalize_before: + hidden_states = self.ff_layer_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + self.ff_scale * hidden_states + if not self.normalize_before: + hidden_states = self.ff_layer_norm(hidden_states) + + if self.conv_module is not None: + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_scores,) + + return outputs + + +class FastSpeech2ConformerMultiLayeredConv1d(nn.Module): + """ + Multi-layered conv1d for Transformer block. + + This is a module of multi-layered conv1d designed to replace positionwise feed-forward network in Transformer + block, which is introduced in 'FastSpeech: Fast, Robust and Controllable Text to Speech' + https://huggingface.co/papers/1905.09263 + """ + + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + """ + Initialize FastSpeech2ConformerMultiLayeredConv1d module. + + Args: + input_channels (`int`): Number of input channels. + hidden_channels (`int`): Number of hidden channels. + kernel_size (`int`): Kernel size of conv1d. + dropout_rate (`float`): Dropout rate. + """ + super().__init__() + input_channels = config.hidden_size + hidden_channels = module_config["linear_units"] + kernel_size = config.positionwise_conv_kernel_size + self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.dropout = nn.Dropout(module_config["dropout_rate"]) + + def forward(self, hidden_states): + """ + Calculate forward propagation. + + Args: + hidden_states (torch.Tensor): Batch of input tensors (batch_size, time, input_channels). + + Returns: + torch.Tensor: Batch of output tensors (batch_size, time, hidden_channels). 
+ """ + hidden_states = hidden_states.transpose(-1, 1) + hidden_states = self.conv1(hidden_states) + hidden_states = torch.relu(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = hidden_states.transpose(-1, 1) + return hidden_states + + +class FastSpeech2ConformerRelPositionalEncoding(nn.Module): + """ + Args: + Relative positional encoding module (new implementation). Details can be found in + https://github.com/espnet/espnet/pull/2816. See : Appendix Batch in https://huggingface.co/papers/1901.02860 + config (`FastSpeech2ConformerConfig`): + FastSpeech2ConformerConfig instance. + module_config (`dict`): + Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`. + """ + + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + """ + Construct an PositionalEncoding object. + """ + super().__init__() + self.embed_dim = config.hidden_size + self.input_scale = math.sqrt(self.embed_dim) + self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"]) + self.pos_enc = None + self.max_len = 5000 + self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pos_enc(self, x): + """Reset the positional encodings.""" + if self.pos_enc is not None: + # self.pos_enc contains both positive and negative parts + # the length of self.pos_enc is 2 * input_len - 1 + if self.pos_enc.size(1) >= x.size(1) * 2 - 1: + if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device: + self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vector and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i